github.com/matrixorigin/matrixone@v1.2.0/pkg/taskservice/task_runner.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package taskservice

import (
	"context"
	"runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/stopper"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/pb/task"
	"go.uber.org/zap"
)

// RunnerOption is an option for creating a task runner.
type RunnerOption func(*taskRunner)

// WithRunnerLogger sets the logger used by the task runner.
func WithRunnerLogger(logger *zap.Logger) RunnerOption {
	return func(r *taskRunner) {
		r.logger = logger
	}
}

// WithRunnerFetchLimit sets the maximum number of tasks fetched per query.
func WithRunnerFetchLimit(limit int) RunnerOption {
	return func(r *taskRunner) {
		r.options.queryLimit = limit
	}
}

// WithRunnerParallelism sets the parallelism for executing tasks.
func WithRunnerParallelism(parallelism int) RunnerOption {
	return func(r *taskRunner) {
		r.options.parallelism = parallelism
	}
}

// WithRunnerMaxWaitTasks sets the maximum number of tasks waiting to be
// executed; once exceeded, fetching blocks until the backlog drains.
func WithRunnerMaxWaitTasks(maxWaitTasks int) RunnerOption {
	return func(r *taskRunner) {
		r.options.maxWaitTasks = maxWaitTasks
	}
}

// WithRunnerFetchInterval sets the interval between task fetches.
func WithRunnerFetchInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.fetchInterval = interval
	}
}

// WithRunnerFetchTimeout sets the timeout for fetching tasks.
func WithRunnerFetchTimeout(timeout time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.fetchTimeout = timeout
	}
}

// WithRunnerHeartbeatInterval sets the interval between heartbeats.
func WithRunnerHeartbeatInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.heartbeatInterval = interval
	}
}

// WithRunnerHeartbeatTimeout sets the heartbeat timeout.
func WithRunnerHeartbeatTimeout(timeout time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.heartbeatTimeout = timeout
	}
}

// WithOptions sets all options needed by the taskRunner.
func WithOptions(
	queryLimit int,
	parallelism int,
	maxWaitTasks int,
	fetchInterval time.Duration,
	fetchTimeout time.Duration,
	retryInterval time.Duration,
	heartbeatInterval time.Duration,
	heartbeatTimeout time.Duration,
) RunnerOption {
	return func(r *taskRunner) {
		r.options.queryLimit = queryLimit
		r.options.parallelism = parallelism
		r.options.maxWaitTasks = maxWaitTasks
		r.options.fetchInterval = fetchInterval
		r.options.fetchTimeout = fetchTimeout
		r.options.retryInterval = retryInterval
		r.options.heartbeatInterval = heartbeatInterval
		r.options.heartbeatTimeout = heartbeatTimeout
	}
}

// WithRunnerRetryInterval sets the retry interval for failed operations.
func WithRunnerRetryInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.retryInterval = interval
	}
}

type taskRunner struct {
	logger       *zap.Logger
	runnerID     string
	service      TaskService
	stopper      stopper.Stopper
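	// waitTasksC buffers fetched tasks until dispatch picks them up;
	// parallelismC is a counting semaphore that bounds concurrent
	// executions; doneC carries finished tasks to the completion loop.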
	waitTasksC   chan runningTask
	parallelismC chan struct{}
	doneC        chan runningTask

	started atomic.Bool

	executors struct {
		sync.RWMutex
		m map[task.TaskCode]TaskExecutor
	}

	runningTasks struct {
		sync.RWMutex
		m map[uint64]runningTask

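		// completedTasks records tasks that completed since the last
		// fetch, so a fetch racing with completion does not re-add a
		// task that just finished.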
		completedTasks map[uint64]struct{}
	}

	retryTasks struct {
		sync.Mutex
		s []runningTask
	}

	// canClaimDaemonTask reports whether this runner may claim a daemon
	// task that belongs to the given account.
	canClaimDaemonTask func(string) bool

	pendingTaskHandle chan TaskHandler
	// daemonTasks contains all daemon tasks that run on this node.
	daemonTasks struct {
		sync.Mutex
		m map[uint64]*daemonTask
	}

	options struct {
		queryLimit        int
		parallelism       int
		maxWaitTasks      int
		fetchInterval     time.Duration
		fetchTimeout      time.Duration
		retryInterval     time.Duration
		heartbeatInterval time.Duration
		heartbeatTimeout  time.Duration
	}
}

// NewTaskRunner creates a new task runner. A TaskRunner is created by CN
// nodes and periodically pulls tasks from the TaskService to execute.
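//
// A minimal usage sketch; the task code, executor and claim function below
// are hypothetical placeholders:
//
//	runner := NewTaskRunner("runner-1", service,
//		func(account string) bool { return true },
//		WithRunnerParallelism(4))
//	runner.RegisterExecutor(myTaskCode, myExecutor)
//	if err := runner.Start(); err != nil {
//		panic(err)
//	}
//	defer runner.Stop()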
func NewTaskRunner(runnerID string, service TaskService, claimFn func(string) bool, opts ...RunnerOption) TaskRunner {
	r := &taskRunner{
		runnerID: runnerID,
		service:  service,
		// set the claim checker function for daemon tasks.
		canClaimDaemonTask: claimFn,
	}
	r.executors.m = make(map[task.TaskCode]TaskExecutor)
	for _, opt := range opts {
		opt(r)
	}
	r.adjust()

	r.logger = logutil.Adjust(r.logger).Named("task-runner").With(zap.String("runner-id", r.runnerID))
	r.stopper = *stopper.NewStopper("task-runner", stopper.WithLogger(r.logger))
	r.parallelismC = make(chan struct{}, r.options.parallelism)
	r.waitTasksC = make(chan runningTask, r.options.maxWaitTasks)
	r.doneC = make(chan runningTask, r.options.maxWaitTasks)
	r.runningTasks.m = make(map[uint64]runningTask)
	r.runningTasks.completedTasks = make(map[uint64]struct{})
	r.pendingTaskHandle = make(chan TaskHandler, 20)
	r.daemonTasks.m = make(map[uint64]*daemonTask)
	return r
}

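// adjust fills in a sane default for every option left unset.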
func (r *taskRunner) adjust() {
	if r.options.parallelism == 0 {
		r.options.parallelism = runtime.NumCPU() / 4
		if r.options.parallelism == 0 {
			r.options.parallelism = 1
		}
	}
	if r.options.fetchInterval == 0 {
		r.options.fetchInterval = time.Second * 10
	}
	if r.options.fetchTimeout == 0 {
		r.options.fetchTimeout = time.Second * 10
	}
	if r.options.heartbeatInterval == 0 {
		r.options.heartbeatInterval = time.Second * 5
	}
	if r.options.heartbeatTimeout == 0 {
		r.options.heartbeatTimeout = time.Second * 30
	}
	if r.options.maxWaitTasks == 0 {
		r.options.maxWaitTasks = 256
	}
	if r.options.queryLimit == 0 {
		r.options.queryLimit = r.options.parallelism
	}
	if r.options.retryInterval == 0 {
		r.options.retryInterval = time.Second
	}
}

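// ID returns the unique ID of this runner.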
func (r *taskRunner) ID() string {
	return r.runnerID
}

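// Start launches the async task workers (fetch, dispatch, done, heartbeat,
// retry) and the daemon task worker. It is a no-op if already started.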
func (r *taskRunner) Start() error {
	if !r.started.CompareAndSwap(false, true) {
		return nil
	}
	if err := r.startAsyncTaskWorker(); err != nil {
		return err
	}
	if err := r.startDaemonTaskWorker(); err != nil {
		return err
	}
	return nil
}

func (r *taskRunner) startAsyncTaskWorker() error {
	if err := r.stopper.RunNamedTask("fetch-task", r.fetch); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("dispatch-task", r.dispatch); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("done-task", r.done); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("heartbeat-task", r.heartbeat); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("retry-task", r.retry); err != nil {
		return err
	}
	return nil
}

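// Stop stops the background workers and closes the internal channels.
// It is a no-op if the runner is not started.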
func (r *taskRunner) Stop() error {
	if !r.started.CompareAndSwap(true, false) {
		return nil
	}

	r.stopper.Stop()
	close(r.waitTasksC)
	close(r.parallelismC)
	close(r.doneC)
	return nil
}

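// Parallelism returns the maximum number of concurrently executing tasks.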
func (r *taskRunner) Parallelism() int {
	return r.options.parallelism
}

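// RegisterExecutor registers the executor for the given task code.
// Registering the same code again is ignored.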
func (r *taskRunner) RegisterExecutor(code task.TaskCode, executor TaskExecutor) {
	r.executors.Lock()
	defer r.executors.Unlock()

	if _, ok := r.executors.m[code]; !ok {
		r.logger.Debug("executor registered", zap.Any("code", code))
		r.executors.m[code] = executor
	}
}

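// GetExecutor returns the executor registered for the given task code, or
// nil if none is registered.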
func (r *taskRunner) GetExecutor(code task.TaskCode) TaskExecutor {
	r.executors.RLock()
	defer r.executors.RUnlock()

	if executor, ok := r.executors.m[code]; ok {
		return executor
	}

	return nil
}

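// Attach attaches the active routine to the daemon task with the given ID,
// failing if the task does not run on this runner.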
func (r *taskRunner) Attach(ctx context.Context, taskID uint64, routine ActiveRoutine) error {
	r.daemonTasks.Lock()
	defer r.daemonTasks.Unlock()
	t, ok := r.daemonTasks.m[taskID]
	if !ok {
		return moerr.NewErrTaskNotFound(ctx, taskID)
	}
	t.activeRoutine.Store(&routine)
	return nil
}

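// fetch periodically queries the task service for tasks assigned to this
// runner and adds the ones not already known to the wait queue.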
func (r *taskRunner) fetch(ctx context.Context) {
	r.logger.Debug("fetch task started")
	ticker := time.NewTicker(r.options.fetchInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("fetch task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			tasks, err := r.doFetch()
			if err != nil {
				r.logger.Error("fetch task failed", zap.Error(err))
				break
			}
			for _, t := range tasks {
				r.addToWait(ctx, t)
			}
		}
	}
}

func (r *taskRunner) doFetch() ([]task.AsyncTask, error) {
	ctx, cancel := context.WithTimeout(context.Background(), r.options.fetchTimeout)
	tasks, err := r.service.QueryAsyncTask(ctx,
		WithTaskStatusCond(task.TaskStatus_Running),
		WithLimitCond(r.options.queryLimit),
		WithTaskRunnerCond(EQ, r.runnerID))
	cancel()
	if err != nil {
		return nil, err
	}
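	// Filter in place, reusing the backing array: keep only tasks that are
	// neither currently running nor just completed on this runner.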
	newTasks := tasks[:0]
	r.runningTasks.Lock()
	for _, t := range tasks {
		if _, ok := r.runningTasks.m[t.ID]; !ok {
			if _, ok := r.runningTasks.completedTasks[t.ID]; !ok {
				r.logger.Info("new task fetched",
					zap.String("task", t.DebugString()))
				newTasks = append(newTasks, t)
			}
		}
	}
	for k := range r.runningTasks.completedTasks {
		delete(r.runningTasks.completedTasks, k)
	}
	r.runningTasks.Unlock()

	if len(newTasks) == 0 {
		return nil, nil
	}

	return newTasks, nil
}

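// addToWait places the task on the wait queue and records it as running.
// It blocks while the queue is full and returns false if ctx is cancelled.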
func (r *taskRunner) addToWait(ctx context.Context, task task.AsyncTask) bool {
	ctx2, cancel := context.WithCancel(ctx)
	rt := runningTask{
		task:   task,
		ctx:    ctx2,
		cancel: cancel,
	}

	select {
	case <-ctx.Done():
		return false
	case r.waitTasksC <- rt:
		r.runningTasks.Lock()
		r.runningTasks.m[task.ID] = rt
		r.runningTasks.Unlock()
		r.logger.Info("task added to wait queue",
			zap.String("task", task.DebugString()))
		return true
	}
}

func (r *taskRunner) dispatch(ctx context.Context) {
	r.logger.Debug("dispatch task started")

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("dispatch task stopped")
			return
		case rt := <-r.waitTasksC:
			if taskFrameworkDisabled() {
				continue
			}
			r.runTask(ctx, rt)
		}
	}
}

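// retry periodically re-runs tasks whose retry time has arrived. The retry
// queue is sorted by retry time, so the scan can stop at the first task
// that is not yet due.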
func (r *taskRunner) retry(ctx context.Context) {
	r.logger.Debug("retry task started")
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	var needRetryTasks []runningTask
	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("retry task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			needRetryTasks = needRetryTasks[:0]
			r.retryTasks.Lock()
			// Collect all due tasks and compact the sorted queue so they
			// are not picked up again on the next tick, including when
			// every queued task is due.
			retained := 0
			for i, rt := range r.retryTasks.s {
				if rt.retryAt.After(time.Now()) {
					retained = copy(r.retryTasks.s, r.retryTasks.s[i:])
					break
				}
				needRetryTasks = append(needRetryTasks, rt)
			}
			r.retryTasks.s = r.retryTasks.s[:retained]
			r.retryTasks.Unlock()
			for _, rt := range needRetryTasks {
				r.runTask(ctx, rt)
			}
		}
	}
}

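// runTask waits for a free parallelism slot, then executes the task on a
// goroutine managed by the stopper.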
func (r *taskRunner) runTask(ctx context.Context, rt runningTask) {
	select {
	case <-ctx.Done():
	case r.parallelismC <- struct{}{}:
		r.run(rt)
	}
}

func (r *taskRunner) run(rt runningTask) {
	err := r.stopper.RunTask(func(ctx context.Context) {
		start := time.Now()
		r.logger.Debug("task start execute",
			zap.String("task", rt.task.DebugString()))
		defer func() {
			r.logger.Debug("task execute completed",
				zap.String("task", rt.task.DebugString()),
				zap.Duration("cost", time.Since(start)))
		}()

		if executor, err := r.getExecutor(rt.task.Metadata.Executor); err != nil {
			r.taskExecResult(rt, err, false)
		} else if err := executor(rt.ctx, &rt.task); err != nil {
			r.taskExecResult(rt, err, true)
		} else {
			r.taskExecResult(rt, nil, false)
		}
	})
	if err != nil {
		r.logger.Error("run task failed", zap.Error(err))
	}
}

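// taskExecResult records the execution result. A failed task that can
// still be retried is re-queued; everything else is reported as done.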
func (r *taskRunner) taskExecResult(rt runningTask, err error, mayRetry bool) {
	if err == nil {
		rt.task.ExecuteResult = &task.ExecuteResult{
			Code: task.ResultCode_Success,
		}
	} else {
		r.logger.Error("run task failed",
			zap.String("task", rt.task.DebugString()),
			zap.Error(err))
		rt.task.ExecuteResult = &task.ExecuteResult{
			Code:  task.ResultCode_Failed,
			Error: err.Error(),
		}
	}

	if mayRetry && rt.canRetry() {
		rt.retryTimes++
		rt.retryAt = time.Now().Add(time.Duration(rt.task.Metadata.Options.RetryInterval))
		if !r.addRetryTask(rt) {
			// retry queue is full, let scheduler re-allocate.
			r.removeRunningTask(rt.task.ID)
			r.releaseParallel()
		}
		return
	}
	r.addDoneTask(rt)
}

func (r *taskRunner) addDoneTask(rt runningTask) {
	r.releaseParallel()
	r.doneC <- rt
}

func (r *taskRunner) addRetryTask(task runningTask) bool {
	r.retryTasks.Lock()
	defer r.retryTasks.Unlock()
	if len(r.retryTasks.s) >= r.options.maxWaitTasks {
		return false
	}

	r.retryTasks.s = append(r.retryTasks.s, task)
	sort.Slice(r.retryTasks.s, func(i, j int) bool {
		return r.retryTasks.s[i].retryAt.Before(r.retryTasks.s[j].retryAt)
	})
	return true
}

func (r *taskRunner) releaseParallel() {
	// Release the slot so another task can execute. The receive must never
	// block here: the caller always holds a slot, so an empty channel
	// indicates a double release.
	select {
	case <-r.parallelismC:
	default:
		panic("BUG")
	}
}

func (r *taskRunner) done(ctx context.Context) {
	r.logger.Debug("done task started")

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("done task stopped")
			return
		case rt := <-r.doneC:
			if taskFrameworkDisabled() {
				continue
			}
			r.doTaskDone(ctx, rt)
		}
	}
}

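// doTaskDone reports the completion to the task service, retrying until it
// succeeds, the task becomes invalid, or a context is cancelled.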
func (r *taskRunner) doTaskDone(ctx context.Context, rt runningTask) bool {
	for {
		select {
		case <-ctx.Done():
			return false
		case <-rt.ctx.Done():
			return false
		default:
			err := r.service.Complete(rt.ctx, r.runnerID, rt.task, *rt.task.ExecuteResult)
			if err == nil || moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
				r.removeRunningTask(rt.task.ID)
				r.logger.Info("task completed",
					zap.String("task", rt.task.DebugString()),
					zap.Error(err))
				return true
			}

			r.logger.Error("task done failed, retry later",
				zap.String("task", rt.task.DebugString()),
				zap.Error(err))
			time.Sleep(r.options.retryInterval)
		}
	}
}

func (r *taskRunner) heartbeat(ctx context.Context) {
	r.logger.Debug("heartbeat task started")
	ticker := time.NewTicker(r.options.heartbeatInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("heartbeat task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			r.doHeartbeat(ctx)
		}
	}
}

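// doHeartbeat snapshots the running tasks under the read lock, then sends
// heartbeats outside of it. Tasks the service reports as invalid are
// cancelled and removed.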
func (r *taskRunner) doHeartbeat(ctx context.Context) {
	r.runningTasks.RLock()
	tasks := make([]runningTask, 0, len(r.runningTasks.m))
	for _, rt := range r.runningTasks.m {
		tasks = append(tasks, rt)
	}
	r.runningTasks.RUnlock()

	for _, rt := range tasks {
		if err := r.service.Heartbeat(ctx, rt.task); err != nil {
			if moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
				r.removeRunningTask(rt.task.ID)
				rt.cancel()
			}
			r.logger.Error("task heartbeat failed",
				zap.String("task", rt.task.DebugString()),
				zap.Error(err))
		}
	}
}

func (r *taskRunner) removeRunningTask(id uint64) {
	r.runningTasks.Lock()
	defer r.runningTasks.Unlock()
	delete(r.runningTasks.m, id)
	r.runningTasks.completedTasks[id] = struct{}{}
	r.logger.Info("task removed", zap.Uint64("task-id", id))
}

func (r *taskRunner) getExecutor(code task.TaskCode) (TaskExecutor, error) {
	r.executors.RLock()
	defer r.executors.RUnlock()

	if executor, ok := r.executors.m[code]; ok {
		return executor, nil
	}
	return nil, moerr.NewInternalErrorNoCtx("executor with code %d does not exist", code)
}

type runningTask struct {
	task       task.AsyncTask
	ctx        context.Context
	cancel     context.CancelFunc
	retryTimes uint32
	retryAt    time.Time
}

func (rt runningTask) canRetry() bool {
	return rt.retryTimes < rt.task.Metadata.Options.MaxRetryTimes
}