github.com/matrixorigin/matrixone@v1.2.0/pkg/taskservice/task_runner.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package taskservice

import (
	"context"
	"runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/stopper"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/pb/task"
	"go.uber.org/zap"
)

// RunnerOption is an option for creating a task runner.
type RunnerOption func(*taskRunner)

// WithRunnerLogger sets the logger.
func WithRunnerLogger(logger *zap.Logger) RunnerOption {
	return func(r *taskRunner) {
		r.logger = logger
	}
}

// WithRunnerFetchLimit sets the limit on the number of tasks fetched per query.
func WithRunnerFetchLimit(limit int) RunnerOption {
	return func(r *taskRunner) {
		r.options.queryLimit = limit
	}
}

// WithRunnerParallelism sets the parallelism for executing tasks.
func WithRunnerParallelism(parallelism int) RunnerOption {
	return func(r *taskRunner) {
		r.options.parallelism = parallelism
	}
}

// WithRunnerMaxWaitTasks sets the maximum number of tasks waiting to be
// executed; once exceeded, fetching tasks blocks.
func WithRunnerMaxWaitTasks(maxWaitTasks int) RunnerOption {
	return func(r *taskRunner) {
		r.options.maxWaitTasks = maxWaitTasks
	}
}

// WithRunnerFetchInterval sets the interval between task fetches.
func WithRunnerFetchInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.fetchInterval = interval
	}
}

// WithRunnerFetchTimeout sets the fetch timeout.
func WithRunnerFetchTimeout(timeout time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.fetchTimeout = timeout
	}
}

// WithRunnerHeartbeatInterval sets the heartbeat interval.
func WithRunnerHeartbeatInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.heartbeatInterval = interval
	}
}
// WithRunnerHeartbeatTimeout sets the heartbeat timeout.
func WithRunnerHeartbeatTimeout(timeout time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.heartbeatTimeout = timeout
	}
}

// WithOptions sets all options needed by taskRunner.
func WithOptions(
	queryLimit int,
	parallelism int,
	maxWaitTasks int,
	fetchInterval time.Duration,
	fetchTimeout time.Duration,
	retryInterval time.Duration,
	heartbeatInterval time.Duration,
	heartbeatTimeout time.Duration,
) RunnerOption {
	return func(r *taskRunner) {
		r.options.queryLimit = queryLimit
		r.options.parallelism = parallelism
		r.options.maxWaitTasks = maxWaitTasks
		r.options.fetchInterval = fetchInterval
		r.options.fetchTimeout = fetchTimeout
		r.options.retryInterval = retryInterval
		r.options.heartbeatInterval = heartbeatInterval
		r.options.heartbeatTimeout = heartbeatTimeout
	}
}

// WithRunnerRetryInterval sets the retry interval for failed operations.
func WithRunnerRetryInterval(interval time.Duration) RunnerOption {
	return func(r *taskRunner) {
		r.options.retryInterval = interval
	}
}

type taskRunner struct {
	logger       *zap.Logger
	runnerID     string
	service      TaskService
	stopper      stopper.Stopper
	waitTasksC   chan runningTask
	parallelismC chan struct{}
	doneC        chan runningTask

	started atomic.Bool

	executors struct {
		sync.RWMutex
		m map[task.TaskCode]TaskExecutor
	}

	runningTasks struct {
		sync.RWMutex
		m map[uint64]runningTask

		completedTasks map[uint64]struct{}
	}

	retryTasks struct {
		sync.Mutex
		s []runningTask
	}

	// canClaimDaemonTask reports whether this runner can claim daemon tasks
	// for the given account.
	canClaimDaemonTask func(string) bool

	pendingTaskHandle chan TaskHandler
	// daemonTasks contains all daemon tasks that run on this node.
	daemonTasks struct {
		sync.Mutex
		m map[uint64]*daemonTask
	}

	options struct {
		queryLimit        int
		parallelism       int
		maxWaitTasks      int
		fetchInterval     time.Duration
		fetchTimeout      time.Duration
		retryInterval     time.Duration
		heartbeatInterval time.Duration
		heartbeatTimeout  time.Duration
	}
}
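// How the pieces above fit together: fetch pulls the async tasks assigned to
// this runner into waitTasksC; dispatch drains waitTasksC and starts each task
// once a slot in parallelismC is acquired; execution results go to doneC,
// where done reports completion back to the TaskService; retryable failures
// are parked in retryTasks (sorted by retryAt) until the retry loop re-runs
// them; heartbeat periodically reports every task in runningTasks.m.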
// NewTaskRunner creates a new task runner. A TaskRunner is created by CN nodes
// and periodically pulls tasks from the TaskService to execute.
func NewTaskRunner(runnerID string, service TaskService, claimFn func(string) bool, opts ...RunnerOption) TaskRunner {
	r := &taskRunner{
		runnerID: runnerID,
		service:  service,
		// claim checker function for daemon tasks.
		canClaimDaemonTask: claimFn,
	}
	r.executors.m = make(map[task.TaskCode]TaskExecutor)
	for _, opt := range opts {
		opt(r)
	}
	r.adjust()

	r.logger = logutil.Adjust(r.logger).Named("task-runner").With(zap.String("runner-id", r.runnerID))
	r.stopper = *stopper.NewStopper("task-runner", stopper.WithLogger(r.logger))
	r.parallelismC = make(chan struct{}, r.options.parallelism)
	r.waitTasksC = make(chan runningTask, r.options.maxWaitTasks)
	r.doneC = make(chan runningTask, r.options.maxWaitTasks)
	r.runningTasks.m = make(map[uint64]runningTask)
	r.runningTasks.completedTasks = make(map[uint64]struct{})
	r.pendingTaskHandle = make(chan TaskHandler, 20)
	r.daemonTasks.m = make(map[uint64]*daemonTask)
	return r
}

func (r *taskRunner) adjust() {
	if r.options.parallelism == 0 {
		r.options.parallelism = runtime.NumCPU() / 4
		if r.options.parallelism == 0 {
			r.options.parallelism = 1
		}
	}
	if r.options.fetchInterval == 0 {
		r.options.fetchInterval = time.Second * 10
	}
	if r.options.fetchTimeout == 0 {
		r.options.fetchTimeout = time.Second * 10
	}
	if r.options.heartbeatInterval == 0 {
		r.options.heartbeatInterval = time.Second * 5
	}
	if r.options.heartbeatTimeout == 0 {
		r.options.heartbeatTimeout = time.Second * 30
	}
	if r.options.maxWaitTasks == 0 {
		r.options.maxWaitTasks = 256
	}
	if r.options.queryLimit == 0 {
		r.options.queryLimit = r.options.parallelism
	}
	if r.options.retryInterval == 0 {
		r.options.retryInterval = time.Second
	}
}

func (r *taskRunner) ID() string {
	return r.runnerID
}

func (r *taskRunner) Start() error {
	if !r.started.CompareAndSwap(false, true) {
		return nil
	}
	if err := r.startAsyncTaskWorker(); err != nil {
		return err
	}
	if err := r.startDaemonTaskWorker(); err != nil {
		return err
	}
	return nil
}

func (r *taskRunner) startAsyncTaskWorker() error {
	if err := r.stopper.RunNamedTask("fetch-task", r.fetch); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("dispatch-task", r.dispatch); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("done-task", r.done); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("heartbeat-task", r.heartbeat); err != nil {
		return err
	}
	if err := r.stopper.RunNamedTask("retry-task", r.retry); err != nil {
		return err
	}
	return nil
}

func (r *taskRunner) Stop() error {
	if !r.started.CompareAndSwap(true, false) {
		return nil
	}

	r.stopper.Stop()
	close(r.waitTasksC)
	close(r.parallelismC)
	close(r.doneC)
	return nil
}

func (r *taskRunner) Parallelism() int {
	return r.options.parallelism
}

func (r *taskRunner) RegisterExecutor(code task.TaskCode, executor TaskExecutor) {
	r.executors.Lock()
	defer r.executors.Unlock()

	if _, ok := r.executors.m[code]; !ok {
		r.logger.Debug("executor registered", zap.Any("code", code))
		r.executors.m[code] = executor
	}
}

func (r *taskRunner) GetExecutor(code task.TaskCode) TaskExecutor {
	r.executors.RLock()
	defer r.executors.RUnlock()

	if executor, ok := r.executors.m[code]; ok {
		return executor
	}

	return nil
}
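// A minimal usage sketch. svc (a TaskService), myTaskCode and myExecutor are
// hypothetical placeholders, not names defined in this file:
//
//	r := NewTaskRunner("runner-1", svc,
//		func(account string) bool { return true },
//		WithRunnerParallelism(4),
//		WithRunnerFetchInterval(10*time.Second))
//	r.RegisterExecutor(myTaskCode, myExecutor)
//	if err := r.Start(); err != nil {
//		// handle the error
//	}
//	defer func() {
//		_ = r.Stop()
//	}()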
func (r *taskRunner) Attach(ctx context.Context, taskID uint64, routine ActiveRoutine) error {
	r.daemonTasks.Lock()
	defer r.daemonTasks.Unlock()

	t, ok := r.daemonTasks.m[taskID]
	if !ok {
		return moerr.NewErrTaskNotFound(ctx, taskID)
	}
	t.activeRoutine.Store(&routine)
	return nil
}

func (r *taskRunner) fetch(ctx context.Context) {
	r.logger.Debug("fetch task started")
	ticker := time.NewTicker(r.options.fetchInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("fetch task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			tasks, err := r.doFetch()
			if err != nil {
				r.logger.Error("fetch task failed", zap.Error(err))
				break
			}
			for _, t := range tasks {
				r.addToWait(ctx, t)
			}
		}
	}
}

func (r *taskRunner) doFetch() ([]task.AsyncTask, error) {
	ctx, cancel := context.WithTimeout(context.Background(), r.options.fetchTimeout)
	tasks, err := r.service.QueryAsyncTask(ctx,
		WithTaskStatusCond(task.TaskStatus_Running),
		WithLimitCond(r.options.queryLimit),
		WithTaskRunnerCond(EQ, r.runnerID))
	cancel()
	if err != nil {
		return nil, err
	}
	newTasks := tasks[:0]
	r.runningTasks.Lock()
	for _, t := range tasks {
		if _, ok := r.runningTasks.m[t.ID]; !ok {
			if _, ok := r.runningTasks.completedTasks[t.ID]; !ok {
				r.logger.Info("new task fetched",
					zap.String("task", t.DebugString()))
				newTasks = append(newTasks, t)
			}
		}
	}
	for k := range r.runningTasks.completedTasks {
		delete(r.runningTasks.completedTasks, k)
	}
	r.runningTasks.Unlock()

	if len(newTasks) == 0 {
		return nil, nil
	}

	return newTasks, nil
}

func (r *taskRunner) addToWait(ctx context.Context, task task.AsyncTask) bool {
	ctx2, cancel := context.WithCancel(ctx)
	rt := runningTask{
		task:   task,
		ctx:    ctx2,
		cancel: cancel,
	}

	select {
	case <-ctx.Done():
		return false
	case r.waitTasksC <- rt:
		r.runningTasks.Lock()
		r.runningTasks.m[task.ID] = rt
		r.runningTasks.Unlock()
		r.logger.Info("task added to wait queue",
			zap.String("task", task.DebugString()))
		return true
	}
}

func (r *taskRunner) dispatch(ctx context.Context) {
	r.logger.Debug("dispatch task started")

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("dispatch task stopped")
			return
		case rt := <-r.waitTasksC:
			if taskFrameworkDisabled() {
				continue
			}
			r.runTask(ctx, rt)
		}
	}
}

func (r *taskRunner) retry(ctx context.Context) {
	r.logger.Debug("retry task started")
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	var needRetryTasks []runningTask
	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("retry task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			needRetryTasks = needRetryTasks[:0]
			r.retryTasks.Lock()
			idx := len(r.retryTasks.s)
			for i, rt := range r.retryTasks.s {
				if rt.retryAt.After(time.Now()) {
					idx = i
					break
				}
				needRetryTasks = append(needRetryTasks, rt)
			}
			// drop the tasks handed over for retry; keep the not-yet-due tail.
			r.retryTasks.s = r.retryTasks.s[:copy(r.retryTasks.s, r.retryTasks.s[idx:])]
			r.retryTasks.Unlock()
			for _, rt := range needRetryTasks {
				r.runTask(ctx, rt)
			}
		}
	}
}

func (r *taskRunner) runTask(ctx context.Context, rt runningTask) {
	select {
	case <-ctx.Done():
	case r.parallelismC <- struct{}{}:
		r.run(rt)
	}
}
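// parallelismC behaves as a counting semaphore with capacity
// options.parallelism: runTask blocks sending into it until a slot is free,
// and releaseParallel frees the slot once the task finishes or is handed back
// to the scheduler.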
func (r *taskRunner) run(rt runningTask) {
	err := r.stopper.RunTask(func(ctx context.Context) {
		start := time.Now()
		r.logger.Debug("task start execute",
			zap.String("task", rt.task.DebugString()))
		defer func() {
			r.logger.Debug("task execute completed",
				zap.String("task", rt.task.DebugString()),
				zap.Duration("cost", time.Since(start)))
		}()

		if executor, err := r.getExecutor(rt.task.Metadata.Executor); err != nil {
			r.taskExecResult(rt, err, false)
		} else if err := executor(rt.ctx, &rt.task); err != nil {
			r.taskExecResult(rt, err, true)
		} else {
			r.taskExecResult(rt, nil, false)
		}
	})
	if err != nil {
		r.logger.Error("run task failed", zap.Error(err))
	}
}

func (r *taskRunner) taskExecResult(rt runningTask, err error, mayRetry bool) {
	if err == nil {
		rt.task.ExecuteResult = &task.ExecuteResult{
			Code: task.ResultCode_Success,
		}
	} else {
		r.logger.Error("run task failed",
			zap.String("task", rt.task.DebugString()),
			zap.Error(err))
		rt.task.ExecuteResult = &task.ExecuteResult{
			Code:  task.ResultCode_Failed,
			Error: err.Error(),
		}
	}

	if mayRetry && rt.canRetry() {
		rt.retryTimes++
		rt.retryAt = time.Now().Add(time.Duration(rt.task.Metadata.Options.RetryInterval))
		if !r.addRetryTask(rt) {
			// retry queue is full, let the scheduler re-allocate.
			r.removeRunningTask(rt.task.ID)
			r.releaseParallel()
		}
		return
	}
	r.addDoneTask(rt)
}

func (r *taskRunner) addDoneTask(rt runningTask) {
	r.releaseParallel()
	r.doneC <- rt
}

func (r *taskRunner) addRetryTask(task runningTask) bool {
	r.retryTasks.Lock()
	defer r.retryTasks.Unlock()
	if len(r.retryTasks.s) >= r.options.maxWaitTasks {
		return false
	}

	r.retryTasks.s = append(r.retryTasks.s, task)
	sort.Slice(r.retryTasks.s, func(i, j int) bool {
		return r.retryTasks.s[i].retryAt.Before(r.retryTasks.s[j].retryAt)
	})
	return true
}

func (r *taskRunner) releaseParallel() {
	// free a slot so another task can execute
	select {
	case <-r.parallelismC:
	default:
		panic("BUG")
	}
}

func (r *taskRunner) done(ctx context.Context) {
	r.logger.Debug("done task started")

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("done task stopped")
			return
		case rt := <-r.doneC:
			if taskFrameworkDisabled() {
				continue
			}
			r.doTaskDone(ctx, rt)
		}
	}
}

func (r *taskRunner) doTaskDone(ctx context.Context, rt runningTask) bool {
	for {
		select {
		case <-ctx.Done():
			return false
		case <-rt.ctx.Done():
			return false
		default:
			err := r.service.Complete(rt.ctx, r.runnerID, rt.task, *rt.task.ExecuteResult)
			if err == nil || moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
				r.removeRunningTask(rt.task.ID)
				r.logger.Info("task completed",
					zap.String("task", rt.task.DebugString()),
					zap.Error(err))
				return true
			}

			r.logger.Error("task done failed, retry later",
				zap.String("task", rt.task.DebugString()),
				zap.Error(err))
			time.Sleep(r.options.retryInterval)
		}
	}
}

func (r *taskRunner) heartbeat(ctx context.Context) {
	r.logger.Debug("heartbeat task started")
	ticker := time.NewTicker(r.options.heartbeatInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			r.logger.Debug("heartbeat task stopped")
			return
		case <-ticker.C:
			if taskFrameworkDisabled() {
				continue
			}
			r.doHeartbeat(ctx)
		}
	}
}
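// doHeartbeat reports progress for every task in runningTasks.m. If the
// TaskService replies with ErrInvalidTask, the task has been reassigned or
// removed by the scheduler, so the local copy is cancelled and dropped.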
func (r *taskRunner) doHeartbeat(ctx context.Context) {
	r.runningTasks.RLock()
	tasks := make([]runningTask, 0, len(r.runningTasks.m))
	for _, rt := range r.runningTasks.m {
		tasks = append(tasks, rt)
	}
	r.runningTasks.RUnlock()

	for _, rt := range tasks {
		if err := r.service.Heartbeat(ctx, rt.task); err != nil {
			if moerr.IsMoErrCode(err, moerr.ErrInvalidTask) {
				r.removeRunningTask(rt.task.ID)
				rt.cancel()
			}
			r.logger.Error("task heartbeat failed",
				zap.String("task", rt.task.DebugString()),
				zap.Error(err))
		}
	}
}

func (r *taskRunner) removeRunningTask(id uint64) {
	r.runningTasks.Lock()
	defer r.runningTasks.Unlock()
	delete(r.runningTasks.m, id)
	r.runningTasks.completedTasks[id] = struct{}{}
	r.logger.Info("task removed", zap.Uint64("task-id", id))
}

func (r *taskRunner) getExecutor(code task.TaskCode) (TaskExecutor, error) {
	r.executors.RLock()
	defer r.executors.RUnlock()

	if executor, ok := r.executors.m[code]; ok {
		return executor, nil
	}
	return nil, moerr.NewInternalErrorNoCtx("executor with code %d not exists", code)
}

type runningTask struct {
	task       task.AsyncTask
	ctx        context.Context
	cancel     context.CancelFunc
	retryTimes uint32
	retryAt    time.Time
}

func (rt runningTask) canRetry() bool {
	return rt.retryTimes < rt.task.Metadata.Options.MaxRetryTimes
}
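// A sketch of an executor that could be registered via RegisterExecutor,
// assuming TaskExecutor has the shape implied by the call in run(), i.e.
// func(context.Context, *task.AsyncTask) error (see the package's type
// definitions for the authoritative signature); the names and body below are
// placeholders:
//
//	func myExecutor(ctx context.Context, t *task.AsyncTask) error {
//		// Do the work described by t.Metadata. Returning a non-nil error marks
//		// the run as failed; within Metadata.Options.MaxRetryTimes the runner
//		// retries after Metadata.Options.RetryInterval, otherwise the failure
//		// is reported to the TaskService via Complete.
//		return nil
//	}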