github.com/matrixorigin/matrixone@v0.7.0/pkg/taskservice/task_runner.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package taskservice
    16  
    17  import (
    18  	"context"
    19  	"runtime"
    20  	"sort"
    21  	"sync"
    22  	"time"
    23  
    24  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    25  	"github.com/matrixorigin/matrixone/pkg/common/stopper"
    26  	"github.com/matrixorigin/matrixone/pkg/logutil"
    27  	"github.com/matrixorigin/matrixone/pkg/pb/task"
    28  	"go.uber.org/zap"
    29  )
    30  
    31  // RunnerOption option for create task runner
    32  type RunnerOption func(*taskRunner)
    33  
    34  // WithRunnerLogger set logger
    35  func WithRunnerLogger(logger *zap.Logger) RunnerOption {
    36  	return func(r *taskRunner) {
    37  		r.logger = logger
    38  	}
    39  }
    40  
    41  // WithRunnerFetchLimit set fetch tasks limit
    42  func WithRunnerFetchLimit(limit int) RunnerOption {
    43  	return func(r *taskRunner) {
    44  		r.options.queryLimit = limit
    45  	}
    46  }
    47  
    48  // WithRunnerParallelism set the parallelism for execute tasks.
    49  func WithRunnerParallelism(parallelism int) RunnerOption {
    50  	return func(r *taskRunner) {
    51  		r.options.parallelism = parallelism
    52  	}
    53  }
    54  
    55  // WithRunnerMaxWaitTasks set the maximum number of tasks waiting to be executed, more than that
    56  // will block fetching tasks.
    57  func WithRunnerMaxWaitTasks(maxWaitTasks int) RunnerOption {
    58  	return func(r *taskRunner) {
    59  		r.options.maxWaitTasks = maxWaitTasks
    60  	}
    61  }
    62  
    63  // WithRunnerFetchInterval set fetch tasks interval duration
    64  func WithRunnerFetchInterval(interval time.Duration) RunnerOption {
    65  	return func(r *taskRunner) {
    66  		r.options.fetchInterval = interval
    67  	}
    68  }
    69  
    70  // WithRunnerFetchTimeout set fetch timeout
    71  func WithRunnerFetchTimeout(timeout time.Duration) RunnerOption {
    72  	return func(r *taskRunner) {
    73  		r.options.fetchTimeout = timeout
    74  	}
    75  }
    76  
    77  // WithRunnerHeartbeatInterval set heartbeat duration
    78  func WithRunnerHeartbeatInterval(interval time.Duration) RunnerOption {
    79  	return func(r *taskRunner) {
    80  		r.options.heartbeatInterval = interval
    81  	}
    82  }
    83  
    84  // WithOptions set all options needed by taskRunner
    85  func WithOptions(
    86  	queryLimit int,
    87  	parallelism int,
    88  	maxWaitTasks int,
    89  	fetchInterval time.Duration,
    90  	fetchTimeout time.Duration,
    91  	retryInterval time.Duration,
    92  	heartbeatInterval time.Duration,
    93  ) RunnerOption {
    94  	return func(r *taskRunner) {
    95  		r.options.queryLimit = queryLimit
    96  		r.options.parallelism = parallelism
    97  		r.options.maxWaitTasks = maxWaitTasks
    98  		r.options.fetchInterval = fetchInterval
    99  		r.options.fetchTimeout = fetchTimeout
   100  		r.options.retryInterval = retryInterval
   101  		r.options.heartbeatInterval = heartbeatInterval
   102  	}
   103  }
   104  
   105  // WithRunnerRetryInterval set retry interval duration for operation
   106  func WithRunnerRetryInterval(interval time.Duration) RunnerOption {
   107  	return func(r *taskRunner) {
   108  		r.options.retryInterval = interval
   109  	}
   110  }
   111  
   112  type taskRunner struct {
   113  	logger       *zap.Logger
   114  	runnerID     string
   115  	service      TaskService
   116  	stopper      stopper.Stopper
   117  	lastTaskID   uint64
   118  	waitTasksC   chan task.Task
   119  	parallelismC chan struct{}
   120  	doneC        chan runningTask
   121  
   122  	mu struct {
   123  		sync.RWMutex
   124  		started      bool
   125  		executors    map[task.TaskCode]TaskExecutor
   126  		runningTasks map[uint64]runningTask
   127  		retryTasks   []runningTask
   128  	}
   129  
   130  	options struct {
   131  		queryLimit        int
   132  		parallelism       int
   133  		maxWaitTasks      int
   134  		fetchInterval     time.Duration
   135  		fetchTimeout      time.Duration
   136  		retryInterval     time.Duration
   137  		heartbeatInterval time.Duration
   138  	}
   139  }
   140  
   141  // NewTaskRunner new task runner. The TaskRunner can be created by CN nodes and pull tasks from TaskService to
   142  // execute periodically.
   143  func NewTaskRunner(runnerID string, service TaskService, opts ...RunnerOption) TaskRunner {
   144  	r := &taskRunner{
   145  		runnerID: runnerID,
   146  		service:  service,
   147  	}
   148  	r.mu.executors = make(map[task.TaskCode]TaskExecutor)
   149  	for _, opt := range opts {
   150  		opt(r)
   151  	}
   152  	r.adjust()
   153  
   154  	r.logger = logutil.Adjust(r.logger).Named("task-runner").With(zap.String("runner-id", r.runnerID))
   155  	r.stopper = *stopper.NewStopper("task-runner", stopper.WithLogger(r.logger))
   156  	r.parallelismC = make(chan struct{}, r.options.parallelism)
   157  	r.waitTasksC = make(chan task.Task, r.options.maxWaitTasks)
   158  	r.doneC = make(chan runningTask, r.options.maxWaitTasks)
   159  	r.mu.runningTasks = make(map[uint64]runningTask)
   160  	return r
   161  }
   162  
   163  func (r *taskRunner) adjust() {
   164  	if r.options.parallelism == 0 {
   165  		r.options.parallelism = runtime.NumCPU() / 16
   166  		if r.options.parallelism == 0 {
   167  			r.options.parallelism = 1
   168  		}
   169  	}
   170  	if r.options.fetchInterval == 0 {
   171  		r.options.fetchInterval = time.Second * 10
   172  	}
   173  	if r.options.fetchTimeout == 0 {
   174  		r.options.fetchTimeout = time.Second * 5
   175  	}
   176  	if r.options.heartbeatInterval == 0 {
   177  		r.options.heartbeatInterval = time.Second * 5
   178  	}
   179  	if r.options.maxWaitTasks == 0 {
   180  		r.options.maxWaitTasks = 256
   181  	}
   182  	if r.options.queryLimit == 0 {
   183  		r.options.queryLimit = r.options.parallelism
   184  	}
   185  	if r.options.retryInterval == 0 {
   186  		r.options.retryInterval = time.Second
   187  	}
   188  }
   189  
   190  func (r *taskRunner) ID() string {
   191  	return r.runnerID
   192  }
   193  
   194  func (r *taskRunner) Start() error {
   195  	r.mu.Lock()
   196  	defer r.mu.Unlock()
   197  
   198  	if r.mu.started {
   199  		return nil
   200  	}
   201  
   202  	r.mu.started = true
   203  
   204  	if err := r.stopper.RunNamedTask("fetch-task", r.fetch); err != nil {
   205  		return err
   206  	}
   207  	if err := r.stopper.RunNamedTask("dispatch-task", r.dispatch); err != nil {
   208  		return err
   209  	}
   210  	if err := r.stopper.RunNamedTask("done-task", r.done); err != nil {
   211  		return err
   212  	}
   213  	if err := r.stopper.RunNamedTask("heartbeat-task", r.heartbeat); err != nil {
   214  		return err
   215  	}
   216  	if err := r.stopper.RunNamedTask("retry-task", r.retry); err != nil {
   217  		return err
   218  	}
   219  	return nil
   220  }
   221  
   222  func (r *taskRunner) Stop() error {
   223  	r.mu.Lock()
   224  	if !r.mu.started {
   225  		r.mu.Unlock()
   226  		return nil
   227  	}
   228  	r.mu.started = false
   229  	r.mu.Unlock()
   230  
   231  	r.stopper.Stop()
   232  	close(r.waitTasksC)
   233  	close(r.parallelismC)
   234  	close(r.doneC)
   235  	return nil
   236  }
   237  
   238  func (r *taskRunner) Parallelism() int {
   239  	return r.options.parallelism
   240  }
   241  
   242  func (r *taskRunner) RegisterExecutor(code task.TaskCode, executor TaskExecutor) {
   243  	r.mu.Lock()
   244  	defer r.mu.Unlock()
   245  
   246  	if _, ok := r.mu.executors[code]; !ok {
   247  		r.logger.Debug("executor registered", zap.Any("code", code))
   248  		r.mu.executors[code] = executor
   249  	}
   250  }
   251  
   252  func (r *taskRunner) fetch(ctx context.Context) {
   253  	r.logger.Info("fetch task started")
   254  	timer := time.NewTimer(r.options.fetchInterval)
   255  	defer timer.Stop()
   256  
   257  	for {
   258  		select {
   259  		case <-ctx.Done():
   260  			r.logger.Info("fetch task stopped")
   261  			return
   262  		case <-timer.C:
   263  			if !taskFrameworkDisabled() {
   264  				tasks, err := r.doFetch()
   265  				if err != nil {
   266  					break
   267  				}
   268  				r.addTasks(ctx, tasks)
   269  			}
   270  		}
   271  		timer.Reset(r.options.fetchInterval)
   272  	}
   273  }
   274  
   275  func (r *taskRunner) doFetch() ([]task.Task, error) {
   276  	ctx, cancel := context.WithTimeout(context.Background(), r.options.fetchTimeout)
   277  	tasks, err := r.service.QueryTask(ctx,
   278  		WithTaskIDCond(GT, r.lastTaskID),
   279  		WithLimitCond(r.options.queryLimit),
   280  		WithTaskRunnerCond(EQ, r.runnerID))
   281  	cancel()
   282  	if err != nil {
   283  		r.logger.Error("fetch task failed", zap.Error(err))
   284  		return nil, err
   285  	}
   286  	if len(tasks) == 0 {
   287  		return nil, nil
   288  	}
   289  
   290  	r.lastTaskID = tasks[len(tasks)-1].ID
   291  	r.logger.Debug("new task fetched",
   292  		zap.Int("count", len(tasks)),
   293  		zap.Uint64("last-task-id", r.lastTaskID))
   294  	return tasks, nil
   295  }
   296  
   297  func (r *taskRunner) addTasks(ctx context.Context, tasks []task.Task) {
   298  	for _, task := range tasks {
   299  		r.addToWait(ctx, task)
   300  	}
   301  }
   302  
   303  func (r *taskRunner) addToWait(ctx context.Context, task task.Task) bool {
   304  	select {
   305  	case <-ctx.Done():
   306  		return false
   307  	case r.waitTasksC <- task:
   308  		r.logger.Debug("task added", zap.String("task", task.DebugString()))
   309  		return true
   310  	}
   311  }
   312  
   313  func (r *taskRunner) dispatch(ctx context.Context) {
   314  	r.logger.Info("dispatch task started")
   315  
   316  	for {
   317  		select {
   318  		case <-ctx.Done():
   319  			r.logger.Info("dispatch task stopped")
   320  			return
   321  		case task := <-r.waitTasksC:
   322  			if !taskFrameworkDisabled() {
   323  				r.runTask(ctx, task)
   324  			}
   325  		}
   326  	}
   327  }
   328  
   329  func (r *taskRunner) retry(ctx context.Context) {
   330  	r.logger.Info("retry task started")
   331  	timer := time.NewTimer(time.Second)
   332  	defer timer.Stop()
   333  
   334  	var needRetryTasks []runningTask
   335  	for {
   336  		select {
   337  		case <-ctx.Done():
   338  			r.logger.Info("retry task stopped")
   339  			return
   340  		case <-timer.C:
   341  			if !taskFrameworkDisabled() {
   342  				now := time.Now()
   343  				needRetryTasks = needRetryTasks[:0]
   344  				r.mu.Lock()
   345  				for idx, rt := range r.mu.retryTasks {
   346  					if rt.retryAt.After(now) {
   347  						r.mu.retryTasks = r.mu.retryTasks[:copy(r.mu.retryTasks, r.mu.retryTasks[idx:])]
   348  						break
   349  					}
   350  					needRetryTasks = append(needRetryTasks, rt)
   351  				}
   352  				r.mu.Unlock()
   353  				if len(needRetryTasks) > 0 {
   354  					for _, rt := range needRetryTasks {
   355  						r.runTask(ctx, rt)
   356  					}
   357  				}
   358  			}
   359  		}
   360  		timer.Reset(time.Millisecond * 100)
   361  	}
   362  }
   363  
   364  func (r *taskRunner) runTask(ctx context.Context, value any) bool {
   365  	select {
   366  	case <-ctx.Done():
   367  		return false
   368  	case r.parallelismC <- struct{}{}:
   369  		var rt runningTask
   370  		switch value := value.(type) {
   371  		case task.Task:
   372  			rt = runningTask{task: value}
   373  			rt.ctx, rt.cancel = context.WithCancel(ctx)
   374  			r.mu.Lock()
   375  			r.mu.runningTasks[rt.task.ID] = rt
   376  			r.mu.Unlock()
   377  		case runningTask:
   378  			rt = value
   379  		}
   380  
   381  		r.run(rt)
   382  		return true
   383  	}
   384  }
   385  
   386  func (r *taskRunner) run(rt runningTask) {
   387  	err := r.stopper.RunTask(func(ctx context.Context) {
   388  		start := time.Now()
   389  		r.logger.Debug("task start execute",
   390  			zap.String("task", rt.task.DebugString()))
   391  		defer r.logger.Debug("task execute completed",
   392  			zap.String("task", rt.task.DebugString()),
   393  			zap.Duration("cost", time.Since(start)))
   394  
   395  		executor, err := r.getExecutor(rt.task.Metadata.Executor)
   396  		result := &task.ExecuteResult{Code: task.ResultCode_Success}
   397  		if err == nil {
   398  			if err = executor(rt.ctx, rt.task); err == nil {
   399  				goto taskDone
   400  			}
   401  		}
   402  
   403  		// task failed
   404  		r.logger.Error("run task failed",
   405  			zap.String("task", rt.task.DebugString()),
   406  			zap.Error(err))
   407  		if rt.canRetry() {
   408  			rt.retryTimes++
   409  			rt.retryAt = time.Now().Add(time.Duration(rt.task.Metadata.Options.RetryInterval))
   410  			if !r.addRetryTask(rt) {
   411  				// retry queue is full, let scheduler re-allocate.
   412  				r.removeRunningTask(rt.task.ID)
   413  				r.releaseParallel()
   414  			}
   415  			return
   416  		}
   417  		result.Code = task.ResultCode_Failed
   418  		result.Error = err.Error()
   419  	taskDone:
   420  		rt.task.ExecuteResult = result
   421  		r.addDoneTask(rt)
   422  	})
   423  	if err != nil {
   424  		r.logger.Error("run task failed", zap.Error(err))
   425  	}
   426  }
   427  
   428  func (r *taskRunner) addDoneTask(rt runningTask) {
   429  	r.releaseParallel()
   430  	r.doneC <- rt
   431  }
   432  
   433  func (r *taskRunner) addRetryTask(task runningTask) bool {
   434  	r.mu.Lock()
   435  	defer r.mu.Unlock()
   436  	if len(r.mu.retryTasks) >= r.options.maxWaitTasks {
   437  		return false
   438  	}
   439  
   440  	r.mu.retryTasks = append(r.mu.retryTasks, task)
   441  	sort.Slice(r.mu.retryTasks, func(i, j int) bool {
   442  		return r.mu.retryTasks[i].retryAt.Before(r.mu.retryTasks[j].retryAt)
   443  	})
   444  	return true
   445  }
   446  
   447  func (r *taskRunner) releaseParallel() {
   448  	// other task can execute
   449  	select {
   450  	case <-r.parallelismC:
   451  	default:
   452  		panic("BUG")
   453  	}
   454  }
   455  
   456  func (r *taskRunner) done(ctx context.Context) {
   457  	r.logger.Info("done task started")
   458  
   459  	for {
   460  		select {
   461  		case <-ctx.Done():
   462  			r.logger.Info("done task stopped")
   463  			return
   464  		case task := <-r.doneC:
   465  			if !taskFrameworkDisabled() {
   466  				r.doTaskDone(ctx, task)
   467  			}
   468  		}
   469  	}
   470  }
   471  
   472  func (r *taskRunner) doTaskDone(ctx context.Context, rt runningTask) bool {
   473  	for {
   474  		select {
   475  		case <-ctx.Done():
   476  			return false
   477  		case <-rt.ctx.Done():
   478  			return false
   479  		default:
   480  			err := r.service.Complete(rt.ctx, r.runnerID, rt.task, *rt.task.ExecuteResult)
   481  			if err == nil || moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
   482  				r.removeRunningTask(rt.task.ID)
   483  				return true
   484  			}
   485  
   486  			r.logger.Error("task done failed, retry later",
   487  				zap.String("task", rt.task.DebugString()),
   488  				zap.Error(err))
   489  			time.Sleep(r.options.retryInterval)
   490  		}
   491  	}
   492  }
   493  
   494  func (r *taskRunner) heartbeat(ctx context.Context) {
   495  	r.logger.Info("heartbeat task started")
   496  	timer := time.NewTimer(r.options.heartbeatInterval)
   497  	defer timer.Stop()
   498  
   499  	for {
   500  		select {
   501  		case <-ctx.Done():
   502  			r.logger.Info("heartbeat task stopped")
   503  			return
   504  		case <-timer.C:
   505  			if !taskFrameworkDisabled() {
   506  				r.doHeartbeat(ctx)
   507  			}
   508  		}
   509  		timer.Reset(r.options.heartbeatInterval)
   510  	}
   511  }
   512  
   513  func (r *taskRunner) doHeartbeat(ctx context.Context) {
   514  	r.mu.RLock()
   515  	tasks := make([]runningTask, 0, len(r.mu.runningTasks))
   516  	for _, rt := range r.mu.runningTasks {
   517  		tasks = append(tasks, rt)
   518  	}
   519  	r.mu.RUnlock()
   520  
   521  	for _, rt := range tasks {
   522  		if err := r.service.Heartbeat(ctx, rt.task); err != nil {
   523  			if moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
   524  				r.removeRunningTask(rt.task.ID)
   525  				rt.cancel()
   526  			}
   527  			r.logger.Error("task heartbeat failed", zap.Error(err))
   528  		}
   529  	}
   530  }
   531  
   532  func (r *taskRunner) removeRunningTask(id uint64) {
   533  	r.mu.Lock()
   534  	defer r.mu.Unlock()
   535  
   536  	delete(r.mu.runningTasks, id)
   537  }
   538  
   539  func (r *taskRunner) getExecutor(code task.TaskCode) (TaskExecutor, error) {
   540  	r.mu.RLock()
   541  	defer r.mu.RUnlock()
   542  
   543  	if executor, ok := r.mu.executors[code]; ok {
   544  		return executor, nil
   545  	}
   546  	return nil, moerr.NewInternalErrorNoCtx("executor with code %d not exists", code)
   547  }
   548  
   549  type runningTask struct {
   550  	task       task.Task
   551  	ctx        context.Context
   552  	cancel     context.CancelFunc
   553  	retryTimes uint32
   554  	retryAt    time.Time
   555  }
   556  
   557  func (rt runningTask) canRetry() bool {
   558  	return rt.retryTimes < rt.task.Metadata.Options.MaxRetryTimes
   559  }