github.com/wfusion/gofusion@v1.1.14/common/infra/asynq/server.go

// Copyright 2020 Kentaro Hibino. All rights reserved.
// Use of this source code is governed by a MIT license
// that can be found in the LICENSE file.

package asynq

import (
	"context"
	"errors"
	"fmt"
	"math"
	"math/rand"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/redis/go-redis/v9"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/base"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/log"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/rdb"
)

// Server is responsible for task processing and task lifecycle management.
//
// Server pulls tasks off queues and processes them.
// If the processing of a task is unsuccessful, server will schedule it for a retry.
//
// A task will be retried until either the task gets processed successfully
// or until it reaches its max retry count.
//
// If a task exhausts its retries, it will be moved to the archive and
// will be kept in the archive set.
// Note that the archive size is finite and once it reaches its max size,
// oldest tasks in the archive will be deleted.
type Server struct {
	logger *log.Logger

	broker base.Broker

	state *serverState

	disableRedisConnClose bool

	// wait group to wait for all goroutines to finish.
	wg            sync.WaitGroup
	forwarder     *forwarder
	processor     *processor
	syncer        *syncer
	heartbeater   *heartbeater
	subscriber    *subscriber
	recoverer     *recoverer
	healthchecker *healthchecker
	janitor       *janitor
	aggregator    *aggregator
}

type serverState struct {
	mu    sync.Mutex
	value serverStateValue
}

type serverStateValue int

const (
	// srvStateNew represents a new server. The server begins in
	// this state and then transitions to srvStateActive when
	// Start or Run is called.
	srvStateNew serverStateValue = iota

	// srvStateActive indicates the server is up and active.
	srvStateActive

	// srvStateStopped indicates the server is up but no longer processing new tasks.
	srvStateStopped

	// srvStateClosed indicates the server has been shut down.
	srvStateClosed
)

var serverStates = []string{
	"new",
	"active",
	"stopped",
	"closed",
}

func (s serverStateValue) String() string {
	if srvStateNew <= s && s <= srvStateClosed {
		return serverStates[s]
	}
	return "unknown status"
}

// Config specifies the server's background-task processing behavior.
type Config struct {
	// Maximum number of tasks to process concurrently.
	//
	// If set to a zero or negative value, NewServer will overwrite the value
	// to the number of CPUs usable by the current process.
	Concurrency int

	// BaseContext optionally specifies a function that returns the base context for Handler invocations on this server.
	//
	// If BaseContext is nil, the default is context.Background().
	// If this is defined, then it MUST return a non-nil context.
	BaseContext func() context.Context

	// Function to calculate retry delay for a failed task.
	//
	// By default, it uses an exponential backoff algorithm to calculate the delay.
	RetryDelayFunc RetryDelayFunc

	// Predicate function to determine whether the error returned from Handler is a failure.
	// If the function returns false, Server will not increment the retried counter for the task,
	// and Server won't record the queue stats (processed and failed stats) to avoid skewing the error
	// rate of the queue.
	//
	// By default, if the given error is non-nil the function returns true.
	IsFailure func(error) bool

	// List of queues to process with given priority value. Keys are the names of the
	// queues and values are the associated priority values.
	//
	// If set to nil or not specified, the server will process only the "default" queue.
	//
	// Priority is treated as follows to avoid starving low priority queues.
	//
	// Example:
	//
	//	Queues: map[string]int{
	//		"critical": 6,
	//		"default":  3,
	//		"low":      1,
	//	}
	//
	// With the above config and given that all queues are not empty, the tasks
	// in "critical", "default", "low" should be processed 60%, 30%, 10% of
	// the time respectively.
	//
	// If a queue has a zero or negative priority value, the queue will be ignored.
	Queues map[string]int

	// StrictPriority indicates whether the queue priority should be treated strictly.
	//
	// If set to true, tasks in the queue with the highest priority are processed first.
	// The tasks in lower priority queues are processed only when those queues with
	// higher priorities are empty.
	StrictPriority bool

	// ErrorHandler handles errors returned by the task handler.
	//
	// HandleError is invoked only if the task handler returns a non-nil error.
	//
	// Example:
	//
	//	func reportError(ctx context.Context, task *asynq.Task, err error) {
	//		retried, _ := asynq.GetRetryCount(ctx)
	//		maxRetry, _ := asynq.GetMaxRetry(ctx)
	//		if retried >= maxRetry {
	//			err = fmt.Errorf("retry exhausted for task %s: %w", task.Type, err)
	//		}
	//		errorReportingService.Notify(err)
	//	}
	//
	//	ErrorHandler: asynq.ErrorHandlerFunc(reportError)
	//
	// We can also handle a panic error like:
	//
	//	func reportError(ctx context.Context, task *asynq.Task, err error) {
	//		if asynq.IsPanic(err) {
	//			errorReportingService.Notify(err)
	//		}
	//	}
	//
	//	ErrorHandler: asynq.ErrorHandlerFunc(reportError)
	ErrorHandler ErrorHandler

	// Logger specifies the logger used by the server instance.
	//
	// If unset, default logger is used.
	Logger Logger

	// LogLevel specifies the minimum log level to enable.
	//
	// If unset, InfoLevel is used by default.
	LogLevel LogLevel

	// ShutdownTimeout specifies the duration to wait to let workers finish their tasks
	// before forcing them to abort when stopping the server.
	//
	// If unset or zero, default timeout of 8 seconds is used.
	ShutdownTimeout time.Duration

	// HealthCheckFunc is called periodically with any errors encountered during ping to the
	// connected redis server.
	HealthCheckFunc func(error)

	// HealthCheckInterval specifies the interval between healthchecks.
	//
	// If unset or zero, the interval is set to 15 seconds.
	HealthCheckInterval time.Duration

	// DelayedTaskCheckInterval specifies the interval between checks run on 'scheduled' and 'retry'
	// tasks, and forwarding them to 'pending' state if they are ready to be processed.
	//
	// If unset or zero, the interval is set to 5 seconds.
	DelayedTaskCheckInterval time.Duration

	// GroupGracePeriod specifies the amount of time the server will wait for an incoming task before aggregating
	// the tasks in a group. If an incoming task is received within this period, the server will wait for another
	// period of the same length, up to GroupMaxDelay if specified.
	//
	// If unset or zero, the grace period is set to 1 minute.
	// Minimum duration for GroupGracePeriod is 1 second. If value specified is less than a second, the call to
	// NewServer will panic.
	GroupGracePeriod time.Duration

	// GroupMaxDelay specifies the maximum amount of time the server will wait for incoming tasks before aggregating
	// the tasks in a group.
	//
	// If unset or zero, no delay limit is used.
	GroupMaxDelay time.Duration

	// GroupMaxSize specifies the maximum number of tasks that can be aggregated into a single task within a group.
	// If GroupMaxSize is reached, the server will aggregate the tasks into one immediately.
	//
	// If unset or zero, no size limit is used.
	GroupMaxSize int

	// GroupAggregator specifies the aggregation function used to aggregate multiple tasks in a group into one task.
	//
	// If unset or nil, the group aggregation feature will be disabled on the server.
	GroupAggregator GroupAggregator

	DisableRedisConnClose bool
}

// GroupAggregator aggregates a group of tasks into one before the tasks are passed to the Handler.
type GroupAggregator interface {
	// Aggregate aggregates the given tasks in a group with the given group name,
	// and returns a new task which is the aggregation of those tasks.
	//
	// Use NewTask(typename, payload, opts...) to set any options for the aggregated task.
	// The Queue option, if provided, will be ignored and the aggregated task will always be enqueued
	// to the same queue the group belonged to.
	Aggregate(group string, tasks []*Task) *Task
}

// The GroupAggregatorFunc type is an adapter to allow the use of ordinary functions as a GroupAggregator.
// If f is a function with the appropriate signature, GroupAggregatorFunc(f) is a GroupAggregator that calls f.
type GroupAggregatorFunc func(group string, tasks []*Task) *Task

// Aggregate calls fn(group, tasks)
func (fn GroupAggregatorFunc) Aggregate(group string, tasks []*Task) *Task {
	return fn(group, tasks)
}

// An ErrorHandler handles an error that occurred during task processing.
type ErrorHandler interface {
	HandleError(ctx context.Context, task *Task, err error)
}

// The ErrorHandlerFunc type is an adapter to allow the use of ordinary functions as an ErrorHandler.
// If f is a function with the appropriate signature, ErrorHandlerFunc(f) is an ErrorHandler that calls f.
type ErrorHandlerFunc func(ctx context.Context, task *Task, err error)

// HandleError calls fn(ctx, task, err)
func (fn ErrorHandlerFunc) HandleError(ctx context.Context, task *Task, err error) {
	fn(ctx, task, err)
}

// RetryDelayFunc calculates the retry delay duration for a failed task given
// the retry count, error, and the task.
//
// n is the number of times the task has been retried.
// e is the error returned by the task handler.
// t is the task in question.
type RetryDelayFunc func(n int, e error, t *Task) time.Duration
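
// The following is an illustrative sketch of a custom RetryDelayFunc (a simple linear
// backoff), not part of this package's API; such a function is assigned via
// Config.RetryDelayFunc. The linearBackoff name is hypothetical.
//
//	linearBackoff := func(n int, e error, t *Task) time.Duration {
//		// The delay grows linearly with the retry count n; the error and task are ignored.
//		return time.Duration(n) * 30 * time.Second
//	}
//
//	cfg := Config{RetryDelayFunc: linearBackoff}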

// Logger supports logging at various log levels.
type Logger interface {
	// Debug logs a message at Debug level.
	Debug(args ...any)

	// Info logs a message at Info level.
	Info(args ...any)

	// Warn logs a message at Warning level.
	Warn(args ...any)

	// Error logs a message at Error level.
	Error(args ...any)

	// Fatal logs a message at Fatal level
	// and process will exit with status set to 1.
	Fatal(args ...any)
}

// LogLevel represents logging level.
//
// It satisfies flag.Value interface.
type LogLevel int32

const (
	// Note: reserving value zero to differentiate unspecified case.
	level_unspecified LogLevel = iota

	// DebugLevel is the lowest level of logging.
	// Debug logs are intended for debugging and development purposes.
	DebugLevel

	// InfoLevel is used for general informational log messages.
	InfoLevel

	// WarnLevel is used for undesired but relatively expected events,
	// which may indicate a problem.
	WarnLevel

	// ErrorLevel is used for undesired and unexpected events that
	// the program can recover from.
	ErrorLevel

	// FatalLevel is used for undesired and unexpected events that
	// the program cannot recover from.
	FatalLevel
)

// String is part of the flag.Value interface.
func (l *LogLevel) String() string {
	switch *l {
	case DebugLevel:
		return "debug"
	case InfoLevel:
		return "info"
	case WarnLevel:
		return "warn"
	case ErrorLevel:
		return "error"
	case FatalLevel:
		return "fatal"
	}
	panic(fmt.Sprintf("asynq: unexpected log level: %v", *l))
}

// Set is part of the flag.Value interface.
func (l *LogLevel) Set(val string) error {
	switch strings.ToLower(val) {
	case "debug":
		*l = DebugLevel
	case "info":
		*l = InfoLevel
	case "warn", "warning":
		*l = WarnLevel
	case "error":
		*l = ErrorLevel
	case "fatal":
		*l = FatalLevel
	default:
		return fmt.Errorf("asynq: unsupported log level %q", val)
	}
	return nil
}

func toInternalLogLevel(l LogLevel) log.Level {
	switch l {
	case DebugLevel:
		return log.DebugLevel
	case InfoLevel:
		return log.InfoLevel
	case WarnLevel:
		return log.WarnLevel
	case ErrorLevel:
		return log.ErrorLevel
	case FatalLevel:
		return log.FatalLevel
	}
	panic(fmt.Sprintf("asynq: unexpected log level: %v", l))
}

// DefaultRetryDelayFunc is the default RetryDelayFunc used if one is not specified in Config.
// It uses exponential back-off strategy to calculate the retry delay.
func DefaultRetryDelayFunc(n int, e error, t *Task) time.Duration {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	// Formula taken from https://github.com/mperham/sidekiq.
	s := int(math.Pow(float64(n), 4)) + 15 + (r.Intn(30) * (n + 1))
	return time.Duration(s) * time.Second
}

func defaultIsFailureFunc(err error) bool { return err != nil }

var defaultQueueConfig = map[string]int{
	base.DefaultQueueName: 1,
}

const (
	defaultShutdownTimeout = 8 * time.Second

	defaultHealthCheckInterval = 15 * time.Second

	defaultDelayedTaskCheckInterval = 5 * time.Second

	defaultGroupGracePeriod = 1 * time.Minute
)
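
// For reference, DefaultRetryDelayFunc above computes n^4 + 15 + rand[0,30)*(n+1) seconds.
// The ranges below are derived from that formula (not measured) to illustrate how quickly
// the delay grows with the retry count n:
//
//	n=1: 16s  - 74s
//	n=2: 31s  - 118s
//	n=3: 96s  - 212s
//	n=5: 640s - 814s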

// NewServer returns a new Server given a redis connection option
// and server configuration.
func NewServer(r RedisConnOpt, cfg Config) *Server {
	c, ok := r.MakeRedisClient().(redis.UniversalClient)
	if !ok {
		panic(fmt.Sprintf("asynq: unsupported RedisConnOpt type %T", r))
	}
	baseCtxFn := cfg.BaseContext
	if baseCtxFn == nil {
		baseCtxFn = context.Background
	}
	n := cfg.Concurrency
	if n < 1 {
		n = runtime.NumCPU()
	}
	delayFunc := cfg.RetryDelayFunc
	if delayFunc == nil {
		delayFunc = DefaultRetryDelayFunc
	}
	isFailureFunc := cfg.IsFailure
	if isFailureFunc == nil {
		isFailureFunc = defaultIsFailureFunc
	}
	queues := make(map[string]int)
	for qname, p := range cfg.Queues {
		if err := base.ValidateQueueName(qname); err != nil {
			continue // ignore invalid queue names
		}
		if p > 0 {
			queues[qname] = p
		}
	}
	if len(queues) == 0 {
		queues = defaultQueueConfig
	}
	var qnames []string
	for q := range queues {
		qnames = append(qnames, q)
	}
	shutdownTimeout := cfg.ShutdownTimeout
	if shutdownTimeout == 0 {
		shutdownTimeout = defaultShutdownTimeout
	}
	healthcheckInterval := cfg.HealthCheckInterval
	if healthcheckInterval == 0 {
		healthcheckInterval = defaultHealthCheckInterval
	}
	// TODO: Create a helper to check for zero value and fall back to default (e.g. getDurationOrDefault())
	groupGracePeriod := cfg.GroupGracePeriod
	if groupGracePeriod == 0 {
		groupGracePeriod = defaultGroupGracePeriod
	}
	if groupGracePeriod < time.Second {
		panic("GroupGracePeriod cannot be less than a second")
	}
	logger := log.NewLogger(cfg.Logger)
	loglevel := cfg.LogLevel
	if loglevel == level_unspecified {
		loglevel = InfoLevel
	}
	logger.SetLevel(toInternalLogLevel(loglevel))

	rdb := rdb.NewRDB(c)
	starting := make(chan *workerInfo)
	finished := make(chan *base.TaskMessage)
	syncCh := make(chan *syncRequest)
	srvState := &serverState{value: srvStateNew}
	cancels := base.NewCancelations()

	syncer := newSyncer(syncerParams{
		logger:     logger,
		requestsCh: syncCh,
		interval:   5 * time.Second,
	})
	heartbeater := newHeartbeater(heartbeaterParams{
		logger:         logger,
		broker:         rdb,
		interval:       5 * time.Second,
		concurrency:    n,
		queues:         queues,
		strictPriority: cfg.StrictPriority,
		state:          srvState,
		starting:       starting,
		finished:       finished,
	})
	delayedTaskCheckInterval := cfg.DelayedTaskCheckInterval
	if delayedTaskCheckInterval == 0 {
		delayedTaskCheckInterval = defaultDelayedTaskCheckInterval
	}
	forwarder := newForwarder(forwarderParams{
		logger:   logger,
		broker:   rdb,
		queues:   qnames,
		interval: delayedTaskCheckInterval,
	})
	subscriber := newSubscriber(subscriberParams{
		logger:       logger,
		broker:       rdb,
		cancelations: cancels,
	})
	processor := newProcessor(processorParams{
		logger:          logger,
		broker:          rdb,
		retryDelayFunc:  delayFunc,
		baseCtxFn:       baseCtxFn,
		isFailureFunc:   isFailureFunc,
		syncCh:          syncCh,
		cancelations:    cancels,
		concurrency:     n,
		queues:          queues,
		strictPriority:  cfg.StrictPriority,
		errHandler:      cfg.ErrorHandler,
		shutdownTimeout: shutdownTimeout,
		starting:        starting,
		finished:        finished,
	})
	recoverer := newRecoverer(recovererParams{
		logger:         logger,
		broker:         rdb,
		retryDelayFunc: delayFunc,
		isFailureFunc:  isFailureFunc,
		queues:         qnames,
		interval:       1 * time.Minute,
	})
	healthchecker := newHealthChecker(healthcheckerParams{
		logger:          logger,
		broker:          rdb,
		interval:        healthcheckInterval,
		healthcheckFunc: cfg.HealthCheckFunc,
	})
	janitor := newJanitor(janitorParams{
		logger:   logger,
		broker:   rdb,
		queues:   qnames,
		interval: 8 * time.Second,
	})
	aggregator := newAggregator(aggregatorParams{
		logger:          logger,
		broker:          rdb,
		queues:          qnames,
		gracePeriod:     groupGracePeriod,
		maxDelay:        cfg.GroupMaxDelay,
		maxSize:         cfg.GroupMaxSize,
		groupAggregator: cfg.GroupAggregator,
	})
	return &Server{
		logger:                logger,
		broker:                rdb,
		state:                 srvState,
		disableRedisConnClose: cfg.DisableRedisConnClose,
		forwarder:             forwarder,
		processor:             processor,
		syncer:                syncer,
		heartbeater:           heartbeater,
		subscriber:            subscriber,
		recoverer:             recoverer,
		healthchecker:         healthchecker,
		janitor:               janitor,
		aggregator:            aggregator,
	}
}

// A Handler processes tasks.
//
// ProcessTask should return nil if the processing of a task
// is successful.
//
// If ProcessTask returns a non-nil error or panics, the task
// will be retried after a delay if retry-count is remaining,
// otherwise the task will be archived.
//
// One exception to this rule is when ProcessTask returns a SkipRetry error.
// If the returned error is SkipRetry or an error wraps SkipRetry, retry is
// skipped and the task will be immediately archived instead.
type Handler interface {
	ProcessTask(context.Context, *Task) error
}

// The HandlerFunc type is an adapter to allow the use of
// ordinary functions as a Handler. If f is a function
// with the appropriate signature, HandlerFunc(f) is a
// Handler that calls f.
type HandlerFunc func(context.Context, *Task) error

// ProcessTask calls fn(ctx, task)
func (fn HandlerFunc) ProcessTask(ctx context.Context, task *Task) error {
	return fn(ctx, task)
}

// ErrServerClosed indicates that the operation is now illegal because the server has been shut down.
var ErrServerClosed = errors.New("asynq: Server closed")

// Run starts the task processing and blocks until
// an os signal to exit the program is received. Once it receives
// a signal, it gracefully shuts down all active workers and other
// goroutines that process the tasks.
//
// Run returns any error encountered at server startup time.
// If the server has already been shut down, ErrServerClosed is returned.
func (srv *Server) Run(handler Handler) error {
	if err := srv.Start(handler); err != nil {
		return err
	}
	srv.waitForSignals()
	srv.Shutdown()
	return nil
}
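
// The following is an illustrative sketch of a typical server setup, not a definitive
// recipe. RedisClientOpt and NewServeMux are assumed from the wider asynq package (they
// are not defined in this file), and the "email:deliver" task type and handler body are
// hypothetical.
//
//	srv := NewServer(RedisClientOpt{Addr: "127.0.0.1:6379"}, Config{
//		Concurrency: 10,
//		Queues: map[string]int{
//			"critical": 6,
//			"default":  3,
//			"low":      1,
//		},
//	})
//
//	mux := NewServeMux()
//	mux.HandleFunc("email:deliver", func(ctx context.Context, t *Task) error {
//		// ... deliver the email described by the task payload ...
//		return nil
//	})
//
//	// Run blocks until an exit signal is received, then shuts down gracefully.
//	if err := srv.Run(mux); err != nil {
//		// handle startup error (e.g. ErrServerClosed)
//	}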

// Start starts the worker server. Once the server has started,
// it pulls tasks off queues and starts a worker goroutine for each task
// and then calls Handler to process it.
// Tasks are processed concurrently by the workers up to the number of
// concurrency specified in Config.Concurrency.
//
// Start returns any error encountered at server startup time.
// If the server has already been shut down, ErrServerClosed is returned.
func (srv *Server) Start(handler Handler) error {
	if handler == nil {
		return fmt.Errorf("asynq: server cannot run with nil handler")
	}
	srv.processor.handler = handler

	if err := srv.start(); err != nil {
		return err
	}
	srv.logger.Info("[Common] asynq starting processing")

	srv.heartbeater.start(&srv.wg)
	srv.healthchecker.start(&srv.wg)
	srv.subscriber.start(&srv.wg)
	srv.syncer.start(&srv.wg)
	srv.recoverer.start(&srv.wg)
	srv.forwarder.start(&srv.wg)
	srv.processor.start(&srv.wg)
	srv.janitor.start(&srv.wg)
	srv.aggregator.start(&srv.wg)
	return nil
}

// Checks server state and returns an error if pre-condition is not met.
// Otherwise it sets the server state to active.
func (srv *Server) start() error {
	srv.state.mu.Lock()
	defer srv.state.mu.Unlock()
	switch srv.state.value {
	case srvStateActive:
		return fmt.Errorf("asynq: the server is already running")
	case srvStateStopped:
		return fmt.Errorf("asynq: the server is in the stopped state. Waiting for shutdown.")
	case srvStateClosed:
		return ErrServerClosed
	}
	srv.state.value = srvStateActive
	return nil
}

// Shutdown gracefully shuts down the server.
// It gracefully closes all active workers. The server will wait for
// active workers to finish processing tasks for the duration specified in Config.ShutdownTimeout.
// If a worker doesn't finish processing a task within the timeout, the task will be pushed back to Redis.
func (srv *Server) Shutdown() {
	srv.state.mu.Lock()
	if srv.state.value == srvStateNew || srv.state.value == srvStateClosed {
		srv.state.mu.Unlock()
		// server is not running, do nothing and return.
		return
	}
	srv.state.value = srvStateClosed
	srv.state.mu.Unlock()

	srv.logger.Info("[Common] asynq starting graceful shutdown")
	// Note: The order of shutdown is important.
	// Sender goroutines should be terminated before the receiver goroutines.
	// processor -> syncer (via syncCh)
	// processor -> heartbeater (via starting, finished channels)
	srv.forwarder.shutdown()
	srv.processor.shutdown()
	srv.recoverer.shutdown()
	srv.syncer.shutdown()
	srv.subscriber.shutdown()
	srv.janitor.shutdown()
	srv.aggregator.shutdown()
	srv.healthchecker.shutdown()
	srv.heartbeater.shutdown()
	srv.wg.Wait()

	if !srv.disableRedisConnClose {
		_ = srv.broker.Close()
	}
	srv.logger.Info("[Common] asynq exiting")
}

// Stop signals the server to stop pulling new tasks off queues.
// Stop can be used before shutting down the server to ensure that all
// currently active tasks are processed before server shutdown.
//
// Stop does not shut down the server; make sure to call Shutdown before exiting.
func (srv *Server) Stop() {
	srv.state.mu.Lock()
	if srv.state.value != srvStateActive {
		// Invalid call to Stop, server can only go from Active state to Stopped state.
		srv.state.mu.Unlock()
		return
	}
	srv.state.value = srvStateStopped
	srv.state.mu.Unlock()

	srv.logger.Info("[Common] asynq stopping processor")
	srv.processor.stop()
	srv.logger.Info("[Common] asynq processor stopped")
}
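
// The following is an illustrative sketch of a shutdown sequence driven by the caller
// instead of Run's signal handling. srv and mux are assumed to come from a setup like the
// sketch above, and the stopCh trigger is hypothetical (e.g. closed by application code):
//
//	if err := srv.Start(mux); err != nil {
//		// handle startup error (e.g. ErrServerClosed)
//	}
//
//	<-stopCh // wait for an application-specific stop signal
//
//	// Stop first so no new tasks are pulled, then Shutdown to wait for active
//	// workers (up to Config.ShutdownTimeout) and close the broker connection.
//	srv.Stop()
//	srv.Shutdown()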