github.com/matrixorigin/matrixone@v1.2.0/pkg/taskservice/daemon_task.go (about)

     1  // Copyright 2021 - 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package taskservice
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  	"sync/atomic"
    21  	"time"
    22  
    23  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    24  	"github.com/matrixorigin/matrixone/pkg/pb/task"
    25  	"go.uber.org/zap"
    26  )
    27  
    28  type TaskHandler interface {
    29  	Handle(ctx context.Context) error
    30  }
    31  
    32  type startTask struct {
    33  	runner *taskRunner
    34  	task   *daemonTask
    35  }
    36  
    37  func newStartTask(r *taskRunner, t *daemonTask) *startTask {
    38  	return &startTask{
    39  		runner: r,
    40  		task:   t,
    41  	}
    42  }
    43  
    44  func (t *startTask) Handle(_ context.Context) error {
    45  	if err := t.runner.stopper.RunTask(func(ctx context.Context) {
    46  		defer t.runner.removeDaemonTask(t.task.task.ID)
    47  
    48  		ok, err := t.runner.startDaemonTask(ctx, t.task)
    49  		if err != nil {
    50  			t.runner.setDaemonTaskError(ctx, t.task, err)
    51  			return
    52  		}
    53  
    54  		// ok value is false, means that the task cannot be started by
    55  		// this runner, maybe it has been started by another runner.
    56  		if !ok {
    57  			return
    58  		}
    59  
    60  		// Start the go-routine to execute the task. It hangs here until
    61  		// the task encounters some error or be canceled.
    62  		if err := t.task.executor(ctx, &t.task.task); err != nil {
    63  			// set the record of this task error message.
    64  			t.runner.setDaemonTaskError(ctx, t.task, err)
    65  		}
    66  	}); err != nil {
    67  		return err
    68  	}
    69  	return nil
    70  }
    71  
    72  type resumeTask struct {
    73  	runner *taskRunner
    74  	task   *daemonTask
    75  }
    76  
    77  func newResumeTask(r *taskRunner, t *daemonTask) *resumeTask {
    78  	return &resumeTask{
    79  		runner: r,
    80  		task:   t,
    81  	}
    82  }
    83  
    84  func (t *resumeTask) Handle(ctx context.Context) error {
    85  	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
    86  	defer cancel()
    87  	tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID))
    88  	if err != nil {
    89  		return err
    90  	}
    91  	if len(tasks) != 1 {
    92  		return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks))
    93  	}
    94  
    95  	tk := tasks[0]
    96  	// We cannot resume a task which is not on local runner.
    97  	if !strings.EqualFold(tk.TaskRunner, t.runner.runnerID) {
    98  		return moerr.NewInternalError(ctx, "the task is not on local runner, prev runner %s, "+
    99  			"local runner %s", tk.TaskRunner, t.runner.runnerID)
   100  	}
   101  
   102  	tk.TaskStatus = task.TaskStatus_Running
   103  	nowTime := time.Now()
   104  	tk.LastRun = nowTime
   105  	tk.LastHeartbeat = nowTime
   106  	_, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk})
   107  	if err != nil {
   108  		return err
   109  	}
   110  
   111  	ar := t.task.activeRoutine.Load()
   112  	if ar == nil || *ar == nil {
   113  		return moerr.NewInternalError(ctx, "cannot handle resume operation, "+
   114  			"active routine not set for task %d", t.task.task.ID)
   115  	}
   116  	return (*ar).Resume()
   117  }
   118  
   119  type pauseTask struct {
   120  	runner *taskRunner
   121  	task   *daemonTask
   122  }
   123  
   124  func newPauseTask(r *taskRunner, t *daemonTask) *pauseTask {
   125  	return &pauseTask{
   126  		runner: r,
   127  		task:   t,
   128  	}
   129  }
   130  
   131  func (t *pauseTask) Handle(ctx context.Context) error {
   132  	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   133  	defer cancel()
   134  	tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID))
   135  	if err != nil {
   136  		return err
   137  	}
   138  	if len(tasks) != 1 {
   139  		return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks))
   140  	}
   141  
   142  	tk := tasks[0]
   143  	tk.TaskStatus = task.TaskStatus_Paused
   144  	_, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk})
   145  	if err != nil {
   146  		return err
   147  	}
   148  
   149  	if t.runner.exists(tk.ID) {
   150  		ar := t.task.activeRoutine.Load()
   151  		if ar == nil || *ar == nil {
   152  			return moerr.NewInternalError(ctx, "cannot handle pause operation, "+
   153  				"active routine not set for task %d", t.task.task.ID)
   154  		}
   155  		if err := (*ar).Pause(); err != nil {
   156  			return err
   157  		}
   158  	}
   159  	return nil
   160  }
   161  
   162  type cancelTask struct {
   163  	runner *taskRunner
   164  	task   *daemonTask
   165  }
   166  
   167  func newCancelTask(r *taskRunner, t *daemonTask) *cancelTask {
   168  	return &cancelTask{
   169  		runner: r,
   170  		task:   t,
   171  	}
   172  }
   173  
   174  func (t *cancelTask) Handle(ctx context.Context) error {
   175  	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   176  	defer cancel()
   177  	tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID))
   178  	if err != nil {
   179  		return err
   180  	}
   181  	if len(tasks) != 1 {
   182  		return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks))
   183  	}
   184  
   185  	tk := tasks[0]
   186  	tk.TaskStatus = task.TaskStatus_Canceled
   187  	tk.EndAt = time.Now()
   188  	_, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk})
   189  	if err != nil {
   190  		return err
   191  	}
   192  	if t.runner.exists(tk.ID) {
   193  		ar := t.task.activeRoutine.Load()
   194  		if ar == nil || *ar == nil {
   195  			return moerr.NewInternalError(ctx, "cannot handle cancel operation, "+
   196  				"active routine not set for task %d", t.task.task.ID)
   197  		}
   198  		return (*ar).Cancel()
   199  	}
   200  	return nil
   201  }
   202  
   203  // ActiveRoutine is an interface that the go routine of the daemon task
   204  // should implement.
   205  type ActiveRoutine interface {
   206  	// Resume resumes the go routine of the daemon task.
   207  	Resume() error
   208  	// Pause pauses the go routine of the daemon task.
   209  	Pause() error
   210  	// Cancel cancels the go routine of the daemon task.
   211  	Cancel() error
   212  }
   213  
   214  type daemonTask struct {
   215  	task     task.DaemonTask
   216  	executor TaskExecutor
   217  	// activeRoutine is the go-routine runs in background to execute
   218  	// the daemon task.
   219  	activeRoutine atomic.Pointer[ActiveRoutine]
   220  }
   221  
   222  func (r *taskRunner) newDaemonTask(t task.DaemonTask) (*daemonTask, error) {
   223  	executor, err := r.getExecutor(t.Metadata.Executor)
   224  	if err != nil {
   225  		return nil, err
   226  	}
   227  	dt := &daemonTask{
   228  		task:     t,
   229  		executor: executor,
   230  	}
   231  	return dt, nil
   232  }
   233  
   234  func (r *taskRunner) startDaemonTaskWorker() error {
   235  	if err := r.stopper.RunNamedTask("poll-daemon-tasks", r.poll); err != nil {
   236  		return err
   237  	}
   238  	if err := r.stopper.RunNamedTask("handle-daemon-tasks", r.handleTask); err != nil {
   239  		return err
   240  	}
   241  	if err := r.stopper.RunNamedTask("daemon-tasks-heartbeat", r.sendHeartbeat); err != nil {
   242  		return err
   243  	}
   244  	return nil
   245  }
   246  
   247  func (r *taskRunner) poll(ctx context.Context) {
   248  	timer := time.NewTimer(r.options.fetchInterval)
   249  	defer timer.Stop()
   250  	for {
   251  		select {
   252  		case <-ctx.Done():
   253  			r.logger.Info("daemon task poll worker stopped")
   254  			return
   255  
   256  		case <-timer.C:
   257  			if taskFrameworkDisabled() {
   258  				continue
   259  			}
   260  			r.dispatchTaskHandle(ctx)
   261  			timer.Reset(r.options.fetchInterval)
   262  		}
   263  	}
   264  }
   265  
   266  func (r *taskRunner) enqueue(handler TaskHandler) {
   267  	r.pendingTaskHandle <- handler
   268  }
   269  
   270  func (r *taskRunner) newStartTask(t task.DaemonTask) {
   271  	dt, err := r.newDaemonTask(t)
   272  	if err != nil {
   273  		r.logger.Error("failed to dispatch daemon task",
   274  			zap.Uint64("task ID", t.ID), zap.Error(err))
   275  		return
   276  	}
   277  	r.enqueue(newStartTask(r, dt))
   278  }
   279  
   280  func (r *taskRunner) dispatchTaskHandle(ctx context.Context) {
   281  	r.daemonTasks.Lock()
   282  	defer r.daemonTasks.Unlock()
   283  	for _, t := range r.startTasks(ctx) {
   284  		r.newStartTask(t)
   285  	}
   286  	for _, t := range r.resumeTasks(ctx) {
   287  		dt, ok := r.daemonTasks.m[t.ID]
   288  		if ok {
   289  			r.enqueue(newResumeTask(r, dt))
   290  		} else {
   291  			r.newStartTask(t)
   292  		}
   293  	}
   294  	for _, t := range r.pauseTasks(ctx) {
   295  		dt, ok := r.daemonTasks.m[t.ID]
   296  		if ok {
   297  			r.enqueue(newPauseTask(r, dt))
   298  		} else {
   299  			dt, err := r.newDaemonTask(t)
   300  			if err != nil {
   301  				r.logger.Error("failed to dispatch daemon task",
   302  					zap.Uint64("task ID", t.ID), zap.Error(err))
   303  				return
   304  			}
   305  			r.enqueue(newPauseTask(r, dt))
   306  		}
   307  	}
   308  	for _, t := range r.cancelTasks(ctx) {
   309  		dt, ok := r.daemonTasks.m[t.ID]
   310  		if ok {
   311  			r.enqueue(newCancelTask(r, dt))
   312  		} else {
   313  			dt, err := r.newDaemonTask(t)
   314  			if err != nil {
   315  				r.logger.Error("failed to dispatch daemon task",
   316  					zap.Uint64("task ID", t.ID), zap.Error(err))
   317  				return
   318  			}
   319  			r.enqueue(newCancelTask(r, dt))
   320  		}
   321  	}
   322  }
   323  
   324  func (r *taskRunner) queryDaemonTasks(ctx context.Context, c ...Condition) []task.DaemonTask {
   325  	ctx, cancel := context.WithTimeout(ctx, r.options.fetchTimeout)
   326  	defer cancel()
   327  	t, err := r.service.QueryDaemonTask(ctx, c...)
   328  	if err != nil {
   329  		r.logger.Error("failed to get tasks", zap.Error(err))
   330  		return nil
   331  	}
   332  	return t
   333  }
   334  
   335  // mergeTasks merges all the tasks in all the slices. It not only remove the duplicated tasks,
   336  // but also filter out the tasks if the runner cannot run.
   337  func (r *taskRunner) mergeTasks(tasksSlice ...[]task.DaemonTask) []task.DaemonTask {
   338  	taskIDs := make(map[uint64]struct{})
   339  	var res []task.DaemonTask
   340  	for _, tasks := range tasksSlice {
   341  		for _, t := range tasks {
   342  			if _, ok := taskIDs[t.ID]; ok {
   343  				continue
   344  			}
   345  			if !r.canClaimDaemonTask(t.Account) {
   346  				continue
   347  			}
   348  			taskIDs[t.ID] = struct{}{}
   349  			res = append(res, t)
   350  		}
   351  	}
   352  	return res
   353  }
   354  
   355  // resumeTasks gets the tasks that need to start.
   356  // - status: task.TaskStatus_Created
   357  // - status: task.TaskStatus_Running AND last-heartbeat: timeout
   358  func (r *taskRunner) startTasks(ctx context.Context) []task.DaemonTask {
   359  	return r.mergeTasks(
   360  		r.queryDaemonTasks(ctx,
   361  			WithTaskStatusCond(task.TaskStatus_Created),
   362  		),
   363  		r.queryDaemonTasks(ctx,
   364  			WithTaskStatusCond(task.TaskStatus_Running, task.TaskStatus_ResumeRequested),
   365  			WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()),
   366  		),
   367  	)
   368  }
   369  
   370  // resumeTasks gets the tasks that need to resume.
   371  // - status equals to task.TaskStatus_ResumeRequested and runner equals to local
   372  func (r *taskRunner) resumeTasks(ctx context.Context) []task.DaemonTask {
   373  	// We only resume the tasks that already running on this runner. For the tasks that
   374  	// run on other runners and heartbeat timeout, startTasks() will handle them.
   375  	return r.mergeTasks(
   376  		r.queryDaemonTasks(ctx,
   377  			WithTaskStatusCond(task.TaskStatus_ResumeRequested),
   378  			WithTaskRunnerCond(EQ, r.runnerID),
   379  		),
   380  	)
   381  }
   382  
   383  // pauseTasks gets the tasks that need to pause.
   384  // - status equals to task.TaskStatus_PauseRequested and runner equals to local
   385  func (r *taskRunner) pauseTasks(ctx context.Context) []task.DaemonTask {
   386  	// Handle the tasks which is in PauseRequested status:
   387  	//   1. the task is on current runner
   388  	//   2. the task is on other runners, but heartbeat timeout or null. In the handler,
   389  	//      do NOT pause the active routine in this case.
   390  	return r.mergeTasks(
   391  		r.queryDaemonTasks(ctx,
   392  			WithTaskStatusCond(task.TaskStatus_PauseRequested),
   393  			WithTaskRunnerCond(EQ, r.runnerID),
   394  		),
   395  		r.queryDaemonTasks(ctx,
   396  			WithTaskStatusCond(task.TaskStatus_PauseRequested),
   397  			WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()),
   398  		),
   399  	)
   400  }
   401  
   402  // cancelTasks gets the tasks that need to cancel.
   403  func (r *taskRunner) cancelTasks(ctx context.Context) []task.DaemonTask {
   404  	// Handle the tasks which is in CancelRequested status:
   405  	//   1. the task is on current runner
   406  	//   2. the task is on other runners, but heartbeat timeout or null. In the handler,
   407  	//      do NOT cancel the active routine in this case.
   408  	return r.mergeTasks(
   409  		r.queryDaemonTasks(ctx,
   410  			WithTaskStatusCond(task.TaskStatus_CancelRequested),
   411  			WithTaskRunnerCond(EQ, r.runnerID),
   412  		),
   413  		r.queryDaemonTasks(ctx,
   414  			WithTaskStatusCond(task.TaskStatus_CancelRequested),
   415  			WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()),
   416  		),
   417  	)
   418  }
   419  
   420  func (r *taskRunner) handleTask(ctx context.Context) {
   421  	for {
   422  		select {
   423  		case <-ctx.Done():
   424  			return
   425  		case h := <-r.pendingTaskHandle:
   426  			if err := h.Handle(ctx); err != nil {
   427  				r.logger.Error("failed to handle task", zap.Error(err))
   428  			}
   429  		}
   430  	}
   431  }
   432  
   433  func (r *taskRunner) sendHeartbeat(ctx context.Context) {
   434  	ticker := time.NewTicker(r.options.heartbeatInterval)
   435  	defer ticker.Stop()
   436  	for {
   437  		select {
   438  		case <-ctx.Done():
   439  			r.logger.Debug("heartbeat task stopped")
   440  			return
   441  		case <-ticker.C:
   442  			if taskFrameworkDisabled() {
   443  				continue
   444  			}
   445  			r.doSendHeartbeat(ctx)
   446  		}
   447  	}
   448  }
   449  
   450  func (r *taskRunner) doSendHeartbeat(ctx context.Context) {
   451  	r.daemonTasks.Lock()
   452  	tasks := make([]*daemonTask, 0, len(r.daemonTasks.m))
   453  	for _, dt := range r.daemonTasks.m {
   454  		tasks = append(tasks, dt)
   455  	}
   456  	r.daemonTasks.Unlock()
   457  
   458  	for _, dt := range tasks {
   459  		if err := r.service.HeartbeatDaemonTask(ctx, dt.task); err != nil {
   460  			r.logger.Error("task heartbeat failed",
   461  				zap.Uint64("task ID", dt.task.ID),
   462  				zap.Error(err))
   463  		}
   464  	}
   465  }
   466  
   467  func (r *taskRunner) startDaemonTask(ctx context.Context, dt *daemonTask) (bool, error) {
   468  	t := dt.task
   469  	t.TaskRunner = r.runnerID
   470  	t.TaskStatus = task.TaskStatus_Running
   471  	nowTime := time.Now()
   472  	t.UpdateAt = nowTime
   473  	t.LastRun = nowTime
   474  
   475  	// Update the last heartbeat if the daemon task is started successfully.
   476  	// The new value is used to prevent other runners to start this task at
   477  	// the same time.
   478  	t.LastHeartbeat = nowTime
   479  
   480  	// Clear the error message of the task when start it. And if it fails to
   481  	// start, new error message will be set again.
   482  	t.Details.Error = ""
   483  
   484  	// When update the daemon task, add the condition that last heartbeat of
   485  	// the task must be timeout or be null, which means that other runners does
   486  	// NOT try to start this task.
   487  	c, err := r.service.UpdateDaemonTask(ctx, []task.DaemonTask{t},
   488  		WithLastHeartbeat(LE, nowTime.UnixNano()-r.options.heartbeatTimeout.Nanoseconds()))
   489  	if err != nil {
   490  		return false, err
   491  	}
   492  
   493  	// The daemon task may be updated by other runners, so do not start the task on this runner.
   494  	if c != 1 {
   495  		return false, nil
   496  	}
   497  
   498  	r.addDaemonTask(dt)
   499  	return true, nil
   500  }
   501  
   502  func (r *taskRunner) setDaemonTaskError(ctx context.Context, dt *daemonTask, errMsg error) {
   503  	r.logger.Info("daemon task stopped with error", zap.Uint64("task ID", dt.task.ID),
   504  		zap.Error(errMsg))
   505  	t := dt.task
   506  	nowTime := time.Now()
   507  	t.UpdateAt = nowTime
   508  	t.Details.Error = errMsg.Error()
   509  	// TODO(volgariver6): if it is a retryable error, do not update the status,
   510  	// otherwise, set the status to Error.
   511  	_, err := r.service.UpdateDaemonTask(ctx, []task.DaemonTask{t})
   512  	if err != nil {
   513  		r.logger.Error("failed to set error message to task",
   514  			zap.Uint64("task ID", t.ID),
   515  			zap.String("error message", errMsg.Error()),
   516  			zap.Error(err))
   517  	}
   518  }
   519  
   520  func (r *taskRunner) addDaemonTask(dt *daemonTask) {
   521  	r.daemonTasks.Lock()
   522  	defer r.daemonTasks.Unlock()
   523  	if _, ok := r.daemonTasks.m[dt.task.ID]; ok {
   524  		return
   525  	}
   526  	r.daemonTasks.m[dt.task.ID] = dt
   527  }
   528  
   529  func (r *taskRunner) removeDaemonTask(id uint64) {
   530  	r.daemonTasks.Lock()
   531  	defer r.daemonTasks.Unlock()
   532  	delete(r.daemonTasks.m, id)
   533  }
   534  
   535  func (r *taskRunner) exists(id uint64) bool {
   536  	r.daemonTasks.Lock()
   537  	defer r.daemonTasks.Unlock()
   538  	if _, ok := r.daemonTasks.m[id]; ok {
   539  		return true
   540  	}
   541  	return false
   542  }