github.com/wfusion/gofusion@v1.1.14/common/infra/asynq/processor.go

// Copyright 2020 Kentaro Hibino. All rights reserved.
// Use of this source code is governed by a MIT license
// that can be found in the LICENSE file.

package asynq

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"runtime"
	"runtime/debug"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/wfusion/gofusion/common/infra/asynq/pkg/base"
	asynqcontext "github.com/wfusion/gofusion/common/infra/asynq/pkg/context"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/errors"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/log"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/timeutil"
	"golang.org/x/time/rate"
)

type processor struct {
	logger *log.Logger
	broker base.Broker
	clock  timeutil.Clock

	handler   Handler
	baseCtxFn func() context.Context

	queueConfig map[string]int

	// orderedQueues is set only in strict-priority mode.
	orderedQueues []string

	retryDelayFunc RetryDelayFunc
	isFailureFunc  func(error) bool

	errHandler      ErrorHandler
	shutdownTimeout time.Duration

	// channel via which to send sync requests to syncer.
	syncRequestCh chan<- *syncRequest

	// rate limiter to prevent spamming logs with a bunch of errors.
	errLogLimiter *rate.Limiter

	// sema is a counting semaphore to ensure the number of active workers
	// does not exceed the limit.
	sema chan struct{}

	// channel to communicate back to the long running "processor" goroutine.
	// once is used to send value to the channel only once.
	done chan struct{}
	once sync.Once

	// quit channel is closed when the shutdown of the "processor" goroutine starts.
	quit chan struct{}

	// abort channel communicates to the in-flight worker goroutines to stop.
	abort chan struct{}

	// cancelations is a set of cancel functions for all active tasks.
	cancelations *base.Cancelations

	starting chan<- *workerInfo
	finished chan<- *base.TaskMessage
}

type processorParams struct {
	logger          *log.Logger
	broker          base.Broker
	baseCtxFn       func() context.Context
	retryDelayFunc  RetryDelayFunc
	isFailureFunc   func(error) bool
	syncCh          chan<- *syncRequest
	cancelations    *base.Cancelations
	concurrency     int
	queues          map[string]int
	strictPriority  bool
	errHandler      ErrorHandler
	shutdownTimeout time.Duration
	starting        chan<- *workerInfo
	finished        chan<- *base.TaskMessage
}

// newProcessor constructs a new processor.
func newProcessor(params processorParams) *processor {
	queues := normalizeQueues(params.queues)
	orderedQueues := []string(nil)
	if params.strictPriority {
		orderedQueues = sortByPriority(queues)
	}
	return &processor{
		logger:          params.logger,
		broker:          params.broker,
		baseCtxFn:       params.baseCtxFn,
		clock:           timeutil.NewRealClock(),
		queueConfig:     queues,
		orderedQueues:   orderedQueues,
		retryDelayFunc:  params.retryDelayFunc,
		isFailureFunc:   params.isFailureFunc,
		syncRequestCh:   params.syncCh,
		cancelations:    params.cancelations,
		errLogLimiter:   rate.NewLimiter(rate.Every(3*time.Second), 1),
		sema:            make(chan struct{}, params.concurrency),
		done:            make(chan struct{}),
		quit:            make(chan struct{}),
		abort:           make(chan struct{}),
		errHandler:      params.errHandler,
		handler:         HandlerFunc(func(ctx context.Context, t *Task) error { return fmt.Errorf("handler not set") }),
		shutdownTimeout: params.shutdownTimeout,
		starting:        params.starting,
		finished:        params.finished,
	}
}

// Note: stops only the "processor" goroutine, does not stop workers.
// It's safe to call this method multiple times.
func (p *processor) stop() {
	p.once.Do(func() {
		p.logger.Debug("[Common] asynq processor shutting down...")
		// Unblock if processor is waiting for sema token.
		close(p.quit)
		// Signal the processor goroutine to stop processing tasks
		// from the queue.
		p.done <- struct{}{}
	})
}

// NOTE: once shutdown, processor cannot be re-started.
func (p *processor) shutdown() {
	p.stop()

	time.AfterFunc(p.shutdownTimeout, func() { close(p.abort) })

	p.logger.Info("[Common] asynq waiting for all workers to finish...")
	// block until all workers have released the token
	for i := 0; i < cap(p.sema); i++ {
		p.sema <- struct{}{}
	}
	p.logger.Info("[Common] asynq all workers have finished")
}

func (p *processor) start(wg *sync.WaitGroup) {
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-p.done:
				p.logger.Debug("[Common] asynq processor done")
				return
			default:
				p.exec()
			}
		}
	}()
}

// exec pulls a task out of the queue and starts a worker goroutine to
// process the task.
func (p *processor) exec() {
	select {
	case <-p.quit:
		return
	case p.sema <- struct{}{}: // acquire token
		qnames := p.queues()
		msg, leaseExpirationTime, err := p.broker.Dequeue(qnames...)
		switch {
		case errors.Is(err, errors.ErrNoProcessableTask):
			p.logger.Debug("[Common] asynq all queues are empty")
			// Queues are empty, this is a normal behavior.
			// Sleep to avoid slamming redis and let scheduler move tasks into queues.
			// Note: We are not using a blocking pop operation; we poll the queues instead,
			// which adds significant load to redis.
			time.Sleep(time.Second)
			<-p.sema // release token
			return
		case err != nil:
			if p.errLogLimiter.Allow() {
				p.logger.Errorf("[Common] asynq dequeue error: %v", err)
			}
			<-p.sema // release token
			return
		}

		lease := base.NewLease(leaseExpirationTime)
		deadline := p.computeDeadline(msg)
		p.starting <- &workerInfo{msg, time.Now(), deadline, lease}
		go func() {
			defer func() {
				p.finished <- msg
				<-p.sema // release token
			}()

			ctx, cancel := asynqcontext.New(p.baseCtxFn(), msg, deadline)
			p.cancelations.Add(msg.ID, cancel)
			defer func() {
				cancel()
				p.cancelations.Delete(msg.ID)
			}()

			// check context before starting a worker goroutine.
			select {
			case <-ctx.Done():
				// already canceled (e.g. deadline exceeded).
				p.handleFailedMessage(ctx, lease, msg, ctx.Err())
				return
			default:
			}

			resCh := make(chan error, 1)
			go func() {
				task := newTask(
					msg.Type,
					msg.Payload,
					&ResultWriter{
						id:     msg.ID,
						qname:  msg.Queue,
						broker: p.broker,
						ctx:    ctx,
					},
				)
				resCh <- p.perform(ctx, task)
			}()

			select {
			case <-p.abort:
				// time is up, push the message back to queue and quit this worker goroutine.
				p.logger.Warnf("[Common] asynq quitting worker. task id=%s", msg.ID)
				p.requeue(lease, msg)
				return
			case <-lease.Done():
				cancel()
				p.handleFailedMessage(ctx, lease, msg, ErrLeaseExpired)
				return
			case <-ctx.Done():
				p.handleFailedMessage(ctx, lease, msg, ctx.Err())
				return
			case resErr := <-resCh:
				if resErr != nil {
					p.handleFailedMessage(ctx, lease, msg, resErr)
					return
				}
				p.handleSucceededMessage(lease, msg)
			}
		}()
	}
}

func (p *processor) requeue(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Requeue(ctx, msg)
	if err != nil {
		p.logger.Errorf("[Common] asynq could not push task id=%s back to queue: %v", msg.ID, err)
	} else {
		p.logger.Infof("[Common] asynq pushed task id=%s back to queue", msg.ID)
	}
}

func (p *processor) handleSucceededMessage(l *base.Lease, msg *base.TaskMessage) {
	if msg.Retention > 0 {
		p.markAsComplete(l, msg)
	} else {
		p.markAsDone(l, msg)
	}
}

func (p *processor) markAsComplete(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.MarkAsComplete(ctx, msg)
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s type=%q from %q to %q: %+v",
			msg.ID, msg.Type, base.ActiveKey(msg.Queue), base.CompletedKey(msg.Queue), err)
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.MarkAsComplete(ctx, msg)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

func (p *processor) markAsDone(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Done(ctx, msg)
	if err != nil {
		errMsg := fmt.Sprintf("Could not remove task id=%s type=%q from %q err: %+v",
			msg.ID, msg.Type, base.ActiveKey(msg.Queue), err)
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Done(ctx, msg)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

// SkipRetry is used as a return value from Handler.ProcessTask to indicate that
// the task should not be retried and should be archived instead.
var SkipRetry = errors.New("skip retry for the task")

func (p *processor) handleFailedMessage(ctx context.Context, l *base.Lease, msg *base.TaskMessage, err error) {
	if p.errHandler != nil {
		p.errHandler.HandleError(ctx, NewTask(msg.Type, msg.Payload), err)
	}
	if !p.isFailureFunc(err) {
		// retry the task without marking it as failed
		p.retry(l, msg, err, false /* isFailure */)
		return
	}
	if msg.Retried >= msg.Retry || errors.Is(err, SkipRetry) {
		p.logger.Warnf("[Common] asynq retry exhausted for task id=%s", msg.ID)
		p.archive(l, msg, err)
	} else {
		p.retry(l, msg, err, true /* isFailure */)
	}
}

func (p *processor) retry(l *base.Lease, msg *base.TaskMessage, e error, isFailure bool) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	d := p.retryDelayFunc(msg.Retried, e, NewTask(msg.Type, msg.Payload))
	retryAt := time.Now().Add(d)
	err := p.broker.Retry(ctx, msg, retryAt, e.Error(), isFailure)
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s from %q to %q",
			msg.ID, base.ActiveKey(msg.Queue), base.RetryKey(msg.Queue))
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Retry(ctx, msg, retryAt, e.Error(), isFailure)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

func (p *processor) archive(l *base.Lease, msg *base.TaskMessage, e error) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Archive(ctx, msg, e.Error())
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s from %q to %q",
			msg.ID, base.ActiveKey(msg.Queue), base.ArchivedKey(msg.Queue))
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Archive(ctx, msg, e.Error())
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

// queues returns a list of queues to query.
// Order of the queue names is based on the priority of each queue.
// Queue names are sorted by their priority level if strict-priority is true.
// If strict-priority is false, then the order of queue names is roughly based on
// the priority level but randomized in order to avoid starving low-priority queues.
func (p *processor) queues() []string {
	// skip the overhead of generating a list of queue names
	// if we are processing one queue.
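	// Illustrative example (hypothetical weights): with p.queueConfig = {"critical": 3,
	// "default": 2, "low": 1} and strict priority off, the non-strict path below expands
	// the map into [critical critical critical default default low], shuffles it, and
	// dedupes it via uniq, so higher-weight queues tend to be polled first without
	// starving lower-weight ones.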
	if len(p.queueConfig) == 1 {
		for qname := range p.queueConfig {
			return []string{qname}
		}
	}
	if p.orderedQueues != nil {
		return p.orderedQueues
	}
	var names []string
	for qname, priority := range p.queueConfig {
		for i := 0; i < priority; i++ {
			names = append(names, qname)
		}
	}
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	r.Shuffle(len(names), func(i, j int) { names[i], names[j] = names[j], names[i] })
	return uniq(names, len(p.queueConfig))
}

// perform calls the handler with the given task.
// If the call returns without panic, it simply returns the value;
// otherwise, it recovers from the panic and returns an error.
func (p *processor) perform(ctx context.Context, task *Task) (err error) {
	defer func() {
		if x := recover(); x != nil {
			errMsg := string(debug.Stack())

			p.logger.Errorf("[Common] asynq recovering from panic: %s, See the stack trace below for details:\n%s",
				x, errMsg)
			_, file, line, ok := runtime.Caller(1) // skip the first frame (panic itself)
			if ok && strings.Contains(file, "runtime/") {
				// The panic came from the runtime, most likely due to incorrect
				// map/slice usage. The parent frame should have the real trigger.
				_, file, line, ok = runtime.Caller(2)
			}

			// Include the file and line number info in the error, if runtime.Caller returned ok.
			if ok {
				err = fmt.Errorf("panic [%s:%d]: %v", file, line, x)
			} else {
				err = fmt.Errorf("panic: %v", x)
			}
			err = &errors.PanicError{
				ErrMsg: errMsg,
			}
		}
	}()
	return p.handler.ProcessTask(ctx, task)
}

// uniq dedupes elements and returns a slice of unique names of length l.
// Order of the output slice is based on the input list.
func uniq(names []string, l int) []string {
	var res []string
	seen := make(map[string]struct{})
	for _, s := range names {
		if _, ok := seen[s]; !ok {
			seen[s] = struct{}{}
			res = append(res, s)
		}
		if len(res) == l {
			break
		}
	}
	return res
}

// sortByPriority returns a list of queue names sorted by
// their priority level in descending order.
func sortByPriority(qcfg map[string]int) []string {
	var queues []*queue
	for qname, n := range qcfg {
		queues = append(queues, &queue{qname, n})
	}
	sort.Sort(sort.Reverse(byPriority(queues)))
	var res []string
	for _, q := range queues {
		res = append(res, q.name)
	}
	return res
}

type queue struct {
	name     string
	priority int
}

type byPriority []*queue

func (x byPriority) Len() int           { return len(x) }
func (x byPriority) Less(i, j int) bool { return x[i].priority < x[j].priority }
func (x byPriority) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }

// normalizeQueues divides priority numbers by their greatest common divisor.
func normalizeQueues(queues map[string]int) map[string]int {
	var xs []int
	for _, x := range queues {
		xs = append(xs, x)
	}
	d := gcd(xs...)
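	// Worked example (hypothetical weights): {"critical": 6, "default": 4, "low": 2} has
	// gcd 2, so the division below normalizes it to {"critical": 3, "default": 2, "low": 1}.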
	res := make(map[string]int)
	for q, x := range queues {
		res[q] = x / d
	}
	return res
}

func gcd(xs ...int) int {
	fn := func(x, y int) int {
		for y > 0 {
			x, y = y, x%y
		}
		return x
	}
	res := xs[0]
	for i := 0; i < len(xs); i++ {
		res = fn(xs[i], res)
		if res == 1 {
			return 1
		}
	}
	return res
}

// computeDeadline returns the given task's deadline.
func (p *processor) computeDeadline(msg *base.TaskMessage) time.Time {
	if msg.Timeout == 0 && msg.Deadline == 0 {
		p.logger.Errorf("[Common] asynq: internal error: "+
			"both timeout and deadline are not set for the task message: %s", msg.ID)
		return p.clock.Now().Add(defaultTimeout)
	}
	if msg.Timeout != 0 && msg.Deadline != 0 {
		deadlineUnix := math.Min(float64(p.clock.Now().Unix()+msg.Timeout), float64(msg.Deadline))
		return time.Unix(int64(deadlineUnix), 0)
	}
	if msg.Timeout != 0 {
		return p.clock.Now().Add(time.Duration(msg.Timeout) * time.Second)
	}
	return time.Unix(msg.Deadline, 0)
}

// IsPanicError reports whether err is a panic error recovered from a task handler.
func IsPanicError(err error) bool {
	return errors.IsPanicError(err)
}
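// Usage sketch (illustrative, assuming this fork keeps the upstream asynq ErrorHandlerFunc
// adapter): an ErrorHandler can use IsPanicError to tell recovered handler panics apart
// from ordinary task failures.
//
//	asynq.ErrorHandlerFunc(func(ctx context.Context, task *asynq.Task, err error) {
//		if asynq.IsPanicError(err) {
//			// err was produced by perform recovering from a panic; it carries the stack trace.
//		}
//	})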