github.com/matrixorigin/matrixone@v0.7.0/pkg/taskservice/task_runner.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package taskservice

import (
    "context"
    "runtime"
    "sort"
    "sync"
    "time"

    "github.com/matrixorigin/matrixone/pkg/common/moerr"
    "github.com/matrixorigin/matrixone/pkg/common/stopper"
    "github.com/matrixorigin/matrixone/pkg/logutil"
    "github.com/matrixorigin/matrixone/pkg/pb/task"
    "go.uber.org/zap"
)

// RunnerOption is an option for creating a task runner.
type RunnerOption func(*taskRunner)

// WithRunnerLogger sets the logger.
func WithRunnerLogger(logger *zap.Logger) RunnerOption {
    return func(r *taskRunner) {
        r.logger = logger
    }
}

// WithRunnerFetchLimit sets the maximum number of tasks fetched per query.
func WithRunnerFetchLimit(limit int) RunnerOption {
    return func(r *taskRunner) {
        r.options.queryLimit = limit
    }
}

// WithRunnerParallelism sets the parallelism for executing tasks.
func WithRunnerParallelism(parallelism int) RunnerOption {
    return func(r *taskRunner) {
        r.options.parallelism = parallelism
    }
}
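// The options above (and below) follow the functional-options pattern: each
// RunnerOption mutates the runner's option struct before adjust() fills in
// defaults. A minimal sketch of combining them; the TaskService value `svc`
// is an assumed input, not something defined in this file:
//
//  r := NewTaskRunner("runner-1", svc,
//      WithRunnerParallelism(4),
//      WithRunnerFetchLimit(16),
//      WithRunnerLogger(zap.NewExample()))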
// WithRunnerMaxWaitTasks sets the maximum number of tasks waiting to be
// executed; once the limit is reached, fetching blocks.
func WithRunnerMaxWaitTasks(maxWaitTasks int) RunnerOption {
    return func(r *taskRunner) {
        r.options.maxWaitTasks = maxWaitTasks
    }
}

// WithRunnerFetchInterval sets the interval between task fetches.
func WithRunnerFetchInterval(interval time.Duration) RunnerOption {
    return func(r *taskRunner) {
        r.options.fetchInterval = interval
    }
}

// WithRunnerFetchTimeout sets the timeout for fetching tasks.
func WithRunnerFetchTimeout(timeout time.Duration) RunnerOption {
    return func(r *taskRunner) {
        r.options.fetchTimeout = timeout
    }
}

// WithRunnerHeartbeatInterval sets the interval between heartbeats for
// running tasks.
func WithRunnerHeartbeatInterval(interval time.Duration) RunnerOption {
    return func(r *taskRunner) {
        r.options.heartbeatInterval = interval
    }
}

// WithOptions sets all options needed by the taskRunner.
func WithOptions(
    queryLimit int,
    parallelism int,
    maxWaitTasks int,
    fetchInterval time.Duration,
    fetchTimeout time.Duration,
    retryInterval time.Duration,
    heartbeatInterval time.Duration,
) RunnerOption {
    return func(r *taskRunner) {
        r.options.queryLimit = queryLimit
        r.options.parallelism = parallelism
        r.options.maxWaitTasks = maxWaitTasks
        r.options.fetchInterval = fetchInterval
        r.options.fetchTimeout = fetchTimeout
        r.options.retryInterval = retryInterval
        r.options.heartbeatInterval = heartbeatInterval
    }
}

// WithRunnerRetryInterval sets the retry interval for failed operations.
func WithRunnerRetryInterval(interval time.Duration) RunnerOption {
    return func(r *taskRunner) {
        r.options.retryInterval = interval
    }
}

type taskRunner struct {
    logger       *zap.Logger
    runnerID     string
    service      TaskService
    stopper      stopper.Stopper
    lastTaskID   uint64
    waitTasksC   chan task.Task
    parallelismC chan struct{}
    doneC        chan runningTask

    mu struct {
        sync.RWMutex
        started      bool
        executors    map[task.TaskCode]TaskExecutor
        runningTasks map[uint64]runningTask
        retryTasks   []runningTask
    }

    options struct {
        queryLimit        int
        parallelism       int
        maxWaitTasks      int
        fetchInterval     time.Duration
        fetchTimeout      time.Duration
        retryInterval     time.Duration
        heartbeatInterval time.Duration
    }
}
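// parallelismC acts as a counting semaphore: runTask sends a token into the
// channel before executing (blocking once `parallelism` tasks are in flight),
// and releaseParallel receives one when a task finishes. The idiom in
// isolation, as a sketch:
//
//  sem := make(chan struct{}, parallelism)
//  sem <- struct{}{} // acquire a slot; blocks when all slots are taken
//  <-sem             // release the slot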
// NewTaskRunner creates a new task runner. A TaskRunner is created by CN
// nodes and periodically pulls tasks from the TaskService to execute.
func NewTaskRunner(runnerID string, service TaskService, opts ...RunnerOption) TaskRunner {
    r := &taskRunner{
        runnerID: runnerID,
        service:  service,
    }
    r.mu.executors = make(map[task.TaskCode]TaskExecutor)
    for _, opt := range opts {
        opt(r)
    }
    r.adjust()

    r.logger = logutil.Adjust(r.logger).Named("task-runner").With(zap.String("runner-id", r.runnerID))
    r.stopper = *stopper.NewStopper("task-runner", stopper.WithLogger(r.logger))
    r.parallelismC = make(chan struct{}, r.options.parallelism)
    r.waitTasksC = make(chan task.Task, r.options.maxWaitTasks)
    r.doneC = make(chan runningTask, r.options.maxWaitTasks)
    r.mu.runningTasks = make(map[uint64]runningTask)
    return r
}

func (r *taskRunner) adjust() {
    if r.options.parallelism == 0 {
        r.options.parallelism = runtime.NumCPU() / 16
        if r.options.parallelism == 0 {
            r.options.parallelism = 1
        }
    }
    if r.options.fetchInterval == 0 {
        r.options.fetchInterval = time.Second * 10
    }
    if r.options.fetchTimeout == 0 {
        r.options.fetchTimeout = time.Second * 5
    }
    if r.options.heartbeatInterval == 0 {
        r.options.heartbeatInterval = time.Second * 5
    }
    if r.options.maxWaitTasks == 0 {
        r.options.maxWaitTasks = 256
    }
    if r.options.queryLimit == 0 {
        r.options.queryLimit = r.options.parallelism
    }
    if r.options.retryInterval == 0 {
        r.options.retryInterval = time.Second
    }
}

func (r *taskRunner) ID() string {
    return r.runnerID
}

func (r *taskRunner) Start() error {
    r.mu.Lock()
    defer r.mu.Unlock()

    if r.mu.started {
        return nil
    }

    r.mu.started = true

    if err := r.stopper.RunNamedTask("fetch-task", r.fetch); err != nil {
        return err
    }
    if err := r.stopper.RunNamedTask("dispatch-task", r.dispatch); err != nil {
        return err
    }
    if err := r.stopper.RunNamedTask("done-task", r.done); err != nil {
        return err
    }
    if err := r.stopper.RunNamedTask("heartbeat-task", r.heartbeat); err != nil {
        return err
    }
    if err := r.stopper.RunNamedTask("retry-task", r.retry); err != nil {
        return err
    }
    return nil
}

func (r *taskRunner) Stop() error {
    r.mu.Lock()
    if !r.mu.started {
        r.mu.Unlock()
        return nil
    }
    r.mu.started = false
    r.mu.Unlock()

    r.stopper.Stop()
    close(r.waitTasksC)
    close(r.parallelismC)
    close(r.doneC)
    return nil
}

func (r *taskRunner) Parallelism() int {
    return r.options.parallelism
}

func (r *taskRunner) RegisterExecutor(code task.TaskCode, executor TaskExecutor) {
    r.mu.Lock()
    defer r.mu.Unlock()

    if _, ok := r.mu.executors[code]; !ok {
        r.logger.Debug("executor registered", zap.Any("code", code))
        r.mu.executors[code] = executor
    }
}

func (r *taskRunner) fetch(ctx context.Context) {
    r.logger.Info("fetch task started")
    timer := time.NewTimer(r.options.fetchInterval)
    defer timer.Stop()

    for {
        select {
        case <-ctx.Done():
            r.logger.Info("fetch task stopped")
            return
        case <-timer.C:
            if !taskFrameworkDisabled() {
                tasks, err := r.doFetch()
                if err != nil {
                    break
                }
                r.addTasks(ctx, tasks)
            }
        }
        timer.Reset(r.options.fetchInterval)
    }
}
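// A minimal lifecycle sketch. The TaskService `svc`, the task code, and the
// executor body are illustrative assumptions, not part of this file:
//
//  runner := NewTaskRunner("cn-0", svc, WithRunnerParallelism(4))
//  runner.RegisterExecutor(task.TaskCode_TestOnly, func(ctx context.Context, t task.Task) error {
//      // do the work; returning a non-nil error triggers the retry path
//      return nil
//  })
//  if err := runner.Start(); err != nil {
//      panic(err)
//  }
//  defer runner.Stop()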
func (r *taskRunner) doFetch() ([]task.Task, error) {
    ctx, cancel := context.WithTimeout(context.Background(), r.options.fetchTimeout)
    tasks, err := r.service.QueryTask(ctx,
        WithTaskIDCond(GT, r.lastTaskID),
        WithLimitCond(r.options.queryLimit),
        WithTaskRunnerCond(EQ, r.runnerID))
    cancel()
    if err != nil {
        r.logger.Error("fetch task failed", zap.Error(err))
        return nil, err
    }
    if len(tasks) == 0 {
        return nil, nil
    }

    r.lastTaskID = tasks[len(tasks)-1].ID
    r.logger.Debug("new task fetched",
        zap.Int("count", len(tasks)),
        zap.Uint64("last-task-id", r.lastTaskID))
    return tasks, nil
}

func (r *taskRunner) addTasks(ctx context.Context, tasks []task.Task) {
    for _, task := range tasks {
        r.addToWait(ctx, task)
    }
}

func (r *taskRunner) addToWait(ctx context.Context, task task.Task) bool {
    select {
    case <-ctx.Done():
        return false
    case r.waitTasksC <- task:
        r.logger.Debug("task added", zap.String("task", task.DebugString()))
        return true
    }
}

func (r *taskRunner) dispatch(ctx context.Context) {
    r.logger.Info("dispatch task started")

    for {
        select {
        case <-ctx.Done():
            r.logger.Info("dispatch task stopped")
            return
        case task := <-r.waitTasksC:
            if !taskFrameworkDisabled() {
                r.runTask(ctx, task)
            }
        }
    }
}

func (r *taskRunner) retry(ctx context.Context) {
    r.logger.Info("retry task started")
    timer := time.NewTimer(time.Second)
    defer timer.Stop()

    var needRetryTasks []runningTask
    for {
        select {
        case <-ctx.Done():
            r.logger.Info("retry task stopped")
            return
        case <-timer.C:
            if !taskFrameworkDisabled() {
                now := time.Now()
                needRetryTasks = needRetryTasks[:0]
                r.mu.Lock()
                // retryTasks is kept sorted by retryAt, so the tasks that are
                // due form a prefix of the slice. Collect that prefix, then
                // shift the remainder to the front. Truncating outside the
                // loop also covers the case where every task is due, which
                // would otherwise leave consumed tasks in the slice and run
                // them twice.
                due := 0
                for _, rt := range r.mu.retryTasks {
                    if rt.retryAt.After(now) {
                        break
                    }
                    needRetryTasks = append(needRetryTasks, rt)
                    due++
                }
                r.mu.retryTasks = r.mu.retryTasks[:copy(r.mu.retryTasks, r.mu.retryTasks[due:])]
                r.mu.Unlock()
                for _, rt := range needRetryTasks {
                    r.runTask(ctx, rt)
                }
            }
        }
        timer.Reset(time.Millisecond * 100)
    }
}

func (r *taskRunner) runTask(ctx context.Context, value any) bool {
    select {
    case <-ctx.Done():
        return false
    case r.parallelismC <- struct{}{}:
        var rt runningTask
        switch value := value.(type) {
        case task.Task:
            rt = runningTask{task: value}
            rt.ctx, rt.cancel = context.WithCancel(ctx)
            r.mu.Lock()
            r.mu.runningTasks[rt.task.ID] = rt
            r.mu.Unlock()
        case runningTask:
            rt = value
        }

        r.run(rt)
        return true
    }
}
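// rt.ctx is cancelled when a heartbeat learns the task is no longer valid, so
// executors doing long-running work should watch it. A sketch of a
// well-behaved executor; the name and tick-based loop are illustrative:
//
//  func pollingExecutor(ctx context.Context, t task.Task) error {
//      for {
//          select {
//          case <-ctx.Done():
//              return ctx.Err() // give the parallelism slot back promptly
//          case <-time.After(time.Second):
//              // one unit of work per tick
//          }
//      }
//  }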
func (r *taskRunner) run(rt runningTask) {
    err := r.stopper.RunTask(func(ctx context.Context) {
        start := time.Now()
        r.logger.Debug("task start execute",
            zap.String("task", rt.task.DebugString()))
        defer r.logger.Debug("task execute completed",
            zap.String("task", rt.task.DebugString()),
            zap.Duration("cost", time.Since(start)))

        executor, err := r.getExecutor(rt.task.Metadata.Executor)
        result := &task.ExecuteResult{Code: task.ResultCode_Success}
        if err == nil {
            if err = executor(rt.ctx, rt.task); err == nil {
                goto taskDone
            }
        }

        // task failed
        r.logger.Error("run task failed",
            zap.String("task", rt.task.DebugString()),
            zap.Error(err))
        if rt.canRetry() {
            rt.retryTimes++
            rt.retryAt = time.Now().Add(time.Duration(rt.task.Metadata.Options.RetryInterval))
            if !r.addRetryTask(rt) {
                // retry queue is full, let the scheduler re-allocate the task.
                r.removeRunningTask(rt.task.ID)
                r.releaseParallel()
            }
            return
        }
        result.Code = task.ResultCode_Failed
        result.Error = err.Error()
    taskDone:
        rt.task.ExecuteResult = result
        r.addDoneTask(rt)
    })
    if err != nil {
        r.logger.Error("run task failed", zap.Error(err))
    }
}

func (r *taskRunner) addDoneTask(rt runningTask) {
    r.releaseParallel()
    r.doneC <- rt
}

func (r *taskRunner) addRetryTask(task runningTask) bool {
    r.mu.Lock()
    defer r.mu.Unlock()
    if len(r.mu.retryTasks) >= r.options.maxWaitTasks {
        return false
    }

    r.mu.retryTasks = append(r.mu.retryTasks, task)
    sort.Slice(r.mu.retryTasks, func(i, j int) bool {
        return r.mu.retryTasks[i].retryAt.Before(r.mu.retryTasks[j].retryAt)
    })
    return true
}

func (r *taskRunner) releaseParallel() {
    // free a parallelism slot so another task can execute
    select {
    case <-r.parallelismC:
    default:
        panic("BUG")
    }
}

func (r *taskRunner) done(ctx context.Context) {
    r.logger.Info("done task started")

    for {
        select {
        case <-ctx.Done():
            r.logger.Info("done task stopped")
            return
        case task := <-r.doneC:
            if !taskFrameworkDisabled() {
                r.doTaskDone(ctx, task)
            }
        }
    }
}

func (r *taskRunner) doTaskDone(ctx context.Context, rt runningTask) bool {
    for {
        select {
        case <-ctx.Done():
            return false
        case <-rt.ctx.Done():
            return false
        default:
            err := r.service.Complete(rt.ctx, r.runnerID, rt.task, *rt.task.ExecuteResult)
            if err == nil || moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
                r.removeRunningTask(rt.task.ID)
                return true
            }

            r.logger.Error("task done failed, retry later",
                zap.String("task", rt.task.DebugString()),
                zap.Error(err))
            time.Sleep(r.options.retryInterval)
        }
    }
}

func (r *taskRunner) heartbeat(ctx context.Context) {
    r.logger.Info("heartbeat task started")
    timer := time.NewTimer(r.options.heartbeatInterval)
    defer timer.Stop()

    for {
        select {
        case <-ctx.Done():
            r.logger.Info("heartbeat task stopped")
            return
        case <-timer.C:
            if !taskFrameworkDisabled() {
                r.doHeartbeat(ctx)
            }
        }
        timer.Reset(r.options.heartbeatInterval)
    }
}

func (r *taskRunner) doHeartbeat(ctx context.Context) {
    r.mu.RLock()
    tasks := make([]runningTask, 0, len(r.mu.runningTasks))
    for _, rt := range r.mu.runningTasks {
        tasks = append(tasks, rt)
    }
    r.mu.RUnlock()

    for _, rt := range tasks {
        if err := r.service.Heartbeat(ctx, rt.task); err != nil {
            if moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
                r.removeRunningTask(rt.task.ID)
                rt.cancel()
            }
            r.logger.Error("task heartbeat failed", zap.Error(err))
        }
    }
}

func (r *taskRunner) removeRunningTask(id uint64) {
    r.mu.Lock()
    defer r.mu.Unlock()

    delete(r.mu.runningTasks, id)
}

func (r *taskRunner) getExecutor(code task.TaskCode) (TaskExecutor, error) {
    r.mu.RLock()
    defer r.mu.RUnlock()

    if executor, ok := r.mu.executors[code]; ok {
        return executor, nil
    }
    return nil, moerr.NewInternalErrorNoCtx("executor with code %d not exists", code)
}

// runningTask wraps a task with the per-task context used to cancel it and
// the bookkeeping needed for retries.
type runningTask struct {
    task       task.Task
    ctx        context.Context
    cancel     context.CancelFunc
    retryTimes uint32
    retryAt    time.Time
}
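// Retry pacing is driven entirely by the task's metadata. A sketch of the
// fields consulted, with types inferred from their use in run() and canRetry()
// (literal values are illustrative only):
//
//  t.Metadata.Options.MaxRetryTimes = 3                       // upper bound checked by canRetry
//  t.Metadata.Options.RetryInterval = int64(10 * time.Second) // cast to time.Duration to compute retryAt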
// canRetry reports whether the task may be retried again, i.e. whether the
// retry budget from the task metadata is not yet exhausted.
func (rt runningTask) canRetry() bool {
    return rt.retryTimes < rt.task.Metadata.Options.MaxRetryTimes
}