github.com/wfusion/gofusion@v1.1.14/common/infra/asynq/processor.go

// Copyright 2020 Kentaro Hibino. All rights reserved.
// Use of this source code is governed by a MIT license
// that can be found in the LICENSE file.

package asynq

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"runtime"
	"runtime/debug"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/wfusion/gofusion/common/infra/asynq/pkg/base"
	asynqcontext "github.com/wfusion/gofusion/common/infra/asynq/pkg/context"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/errors"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/log"
	"github.com/wfusion/gofusion/common/infra/asynq/pkg/timeutil"
	"golang.org/x/time/rate"
)

type processor struct {
	logger *log.Logger
	broker base.Broker
	clock  timeutil.Clock

	handler   Handler
	baseCtxFn func() context.Context

	queueConfig map[string]int

	// orderedQueues is set only in strict-priority mode.
	orderedQueues []string

	retryDelayFunc RetryDelayFunc
	isFailureFunc  func(error) bool

	errHandler      ErrorHandler
	shutdownTimeout time.Duration

	// channel via which to send sync requests to syncer.
	syncRequestCh chan<- *syncRequest

	// rate limiter to prevent spamming logs with a bunch of errors.
	errLogLimiter *rate.Limiter

	// sema is a counting semaphore to ensure the number of active workers
	// does not exceed the limit.
	sema chan struct{}

	// channel to communicate back to the long running "processor" goroutine.
	// once is used to send value to the channel only once.
	done chan struct{}
	once sync.Once

	// quit channel is closed when the shutdown of the "processor" goroutine starts.
	quit chan struct{}

	// abort channel communicates to the in-flight worker goroutines to stop.
	abort chan struct{}

	// cancelations is a set of cancel functions for all active tasks.
	cancelations *base.Cancelations

	starting chan<- *workerInfo
	finished chan<- *base.TaskMessage
}

type processorParams struct {
	logger          *log.Logger
	broker          base.Broker
	baseCtxFn       func() context.Context
	retryDelayFunc  RetryDelayFunc
	isFailureFunc   func(error) bool
	syncCh          chan<- *syncRequest
	cancelations    *base.Cancelations
	concurrency     int
	queues          map[string]int
	strictPriority  bool
	errHandler      ErrorHandler
	shutdownTimeout time.Duration
	starting        chan<- *workerInfo
	finished        chan<- *base.TaskMessage
}

// newProcessor constructs a new processor.
func newProcessor(params processorParams) *processor {
	queues := normalizeQueues(params.queues)
	orderedQueues := []string(nil)
	if params.strictPriority {
		orderedQueues = sortByPriority(queues)
	}
	return &processor{
		logger:          params.logger,
		broker:          params.broker,
		baseCtxFn:       params.baseCtxFn,
		clock:           timeutil.NewRealClock(),
		queueConfig:     queues,
		orderedQueues:   orderedQueues,
		retryDelayFunc:  params.retryDelayFunc,
		isFailureFunc:   params.isFailureFunc,
		syncRequestCh:   params.syncCh,
		cancelations:    params.cancelations,
		errLogLimiter:   rate.NewLimiter(rate.Every(3*time.Second), 1),
		sema:            make(chan struct{}, params.concurrency),
		done:            make(chan struct{}),
		quit:            make(chan struct{}),
		abort:           make(chan struct{}),
		errHandler:      params.errHandler,
		handler:         HandlerFunc(func(ctx context.Context, t *Task) error { return fmt.Errorf("handler not set") }),
		shutdownTimeout: params.shutdownTimeout,
		starting:        params.starting,
		finished:        params.finished,
	}
}

// Note: stops only the "processor" goroutine, does not stop workers.
// It's safe to call this method multiple times.
func (p *processor) stop() {
	p.once.Do(func() {
		p.logger.Debug("[Common] asynq processor shutting down...")
		// Unblock if processor is waiting for sema token.
		close(p.quit)
		// Signal the processor goroutine to stop processing tasks
		// from the queue.
		p.done <- struct{}{}
	})
}

// NOTE: once shutdown, processor cannot be re-started.
func (p *processor) shutdown() {
	p.stop()

	time.AfterFunc(p.shutdownTimeout, func() { close(p.abort) })

	p.logger.Info("[Common] asynq waiting for all workers to finish...")
	// block until all workers have released the token
	for i := 0; i < cap(p.sema); i++ {
		p.sema <- struct{}{}
	}
	p.logger.Info("[Common] asynq all workers have finished")
}

func (p *processor) start(wg *sync.WaitGroup) {
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-p.done:
				p.logger.Debug("[Common] asynq processor done")
				return
			default:
				p.exec()
			}
		}
	}()
}

// exec pulls a task out of the queue and starts a worker goroutine to
// process the task.
func (p *processor) exec() {
	select {
	case <-p.quit:
		return
	case p.sema <- struct{}{}: // acquire token
		qnames := p.queues()
		msg, leaseExpirationTime, err := p.broker.Dequeue(qnames...)
		switch {
		case errors.Is(err, errors.ErrNoProcessableTask):
			p.logger.Debug("[Common] asynq all queues are empty")
			// Queues are empty, this is a normal behavior.
			// Sleep to avoid slamming redis and let scheduler move tasks into queues.
			// Note: We are not using a blocking pop operation; we poll the queues instead,
			// which adds significant load to redis.
			time.Sleep(time.Second)
			<-p.sema // release token
			return
		case err != nil:
			if p.errLogLimiter.Allow() {
				p.logger.Errorf("[Common] asynq dequeue error: %v", err)
			}
			<-p.sema // release token
			return
		}

		lease := base.NewLease(leaseExpirationTime)
		deadline := p.computeDeadline(msg)
		p.starting <- &workerInfo{msg, time.Now(), deadline, lease}
		go func() {
			defer func() {
				p.finished <- msg
				<-p.sema // release token
			}()

			ctx, cancel := asynqcontext.New(p.baseCtxFn(), msg, deadline)
			p.cancelations.Add(msg.ID, cancel)
			defer func() {
				cancel()
				p.cancelations.Delete(msg.ID)
			}()

			// check context before starting a worker goroutine.
			select {
			case <-ctx.Done():
				// already canceled (e.g. deadline exceeded).
				p.handleFailedMessage(ctx, lease, msg, ctx.Err())
				return
			default:
			}

			resCh := make(chan error, 1)
			go func() {
				task := newTask(
					msg.Type,
					msg.Payload,
					&ResultWriter{
						id:     msg.ID,
						qname:  msg.Queue,
						broker: p.broker,
						ctx:    ctx,
					},
				)
				resCh <- p.perform(ctx, task)
			}()

			select {
			case <-p.abort:
				// time is up, push the message back to queue and quit this worker goroutine.
				p.logger.Warnf("[Common] asynq quitting worker. task id=%s", msg.ID)
				p.requeue(lease, msg)
				return
			case <-lease.Done():
				cancel()
				p.handleFailedMessage(ctx, lease, msg, ErrLeaseExpired)
				return
			case <-ctx.Done():
				p.handleFailedMessage(ctx, lease, msg, ctx.Err())
				return
			case resErr := <-resCh:
				if resErr != nil {
					p.handleFailedMessage(ctx, lease, msg, resErr)
					return
				}
				p.handleSucceededMessage(lease, msg)
			}
		}()
	}
}

func (p *processor) requeue(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Requeue(ctx, msg)
	if err != nil {
		p.logger.Errorf("[Common] asynq could not push task id=%s back to queue: %v", msg.ID, err)
	} else {
		p.logger.Infof("[Common] asynq pushed task id=%s back to queue", msg.ID)
	}
}

func (p *processor) handleSucceededMessage(l *base.Lease, msg *base.TaskMessage) {
	if msg.Retention > 0 {
		p.markAsComplete(l, msg)
	} else {
		p.markAsDone(l, msg)
	}
}

func (p *processor) markAsComplete(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.MarkAsComplete(ctx, msg)
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s type=%q from %q to %q: %+v",
			msg.ID, msg.Type, base.ActiveKey(msg.Queue), base.CompletedKey(msg.Queue), err)
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.MarkAsComplete(ctx, msg)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

func (p *processor) markAsDone(l *base.Lease, msg *base.TaskMessage) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Done(ctx, msg)
	if err != nil {
		errMsg := fmt.Sprintf("Could not remove task id=%s type=%q from %q err: %+v",
			msg.ID, msg.Type, base.ActiveKey(msg.Queue), err)
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Done(ctx, msg)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

// SkipRetry is used as a return value from Handler.ProcessTask to indicate that
// the task should not be retried and should be archived instead.
var SkipRetry = errors.New("skip retry for the task")

func (p *processor) handleFailedMessage(ctx context.Context, l *base.Lease, msg *base.TaskMessage, err error) {
	if p.errHandler != nil {
		p.errHandler.HandleError(ctx, NewTask(msg.Type, msg.Payload), err)
	}
	if !p.isFailureFunc(err) {
		// retry the task without marking it as failed
		p.retry(l, msg, err, false /* isFailure */)
		return
	}
	if msg.Retried >= msg.Retry || errors.Is(err, SkipRetry) {
		p.logger.Warnf("[Common] asynq retry exhausted for task id=%s", msg.ID)
		p.archive(l, msg, err)
	} else {
		p.retry(l, msg, err, true /* isFailure */)
	}
}

func (p *processor) retry(l *base.Lease, msg *base.TaskMessage, e error, isFailure bool) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	d := p.retryDelayFunc(msg.Retried, e, NewTask(msg.Type, msg.Payload))
	retryAt := time.Now().Add(d)
	err := p.broker.Retry(ctx, msg, retryAt, e.Error(), isFailure)
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s from %q to %q",
			msg.ID, base.ActiveKey(msg.Queue), base.RetryKey(msg.Queue))
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Retry(ctx, msg, retryAt, e.Error(), isFailure)
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

func (p *processor) archive(l *base.Lease, msg *base.TaskMessage, e error) {
	if !l.IsValid() {
		// If lease is not valid, do not write to redis; Let recoverer take care of it.
		return
	}
	ctx, _ := context.WithDeadline(context.Background(), l.Deadline())
	err := p.broker.Archive(ctx, msg, e.Error())
	if err != nil {
		errMsg := fmt.Sprintf("Could not move task id=%s from %q to %q",
			msg.ID, base.ActiveKey(msg.Queue), base.ArchivedKey(msg.Queue))
		p.logger.Warnf("[Common] asynq %s; will retry syncing", errMsg)
		p.syncRequestCh <- &syncRequest{
			fn: func() error {
				return p.broker.Archive(ctx, msg, e.Error())
			},
			errMsg:   errMsg,
			deadline: l.Deadline(),
		}
	}
}

// queues returns a list of queues to query.
// Order of the queue names is based on the priority of each queue.
// Queue names are sorted by their priority level if strict-priority is true.
// If strict-priority is false, then the order of queue names is roughly based on
// the priority level but randomized in order to avoid starving low-priority queues.
func (p *processor) queues() []string {
	// skip the overhead of generating a list of queue names
	// if we are processing one queue.
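	// Illustrative example (hypothetical weights): with p.queueConfig = {"critical": 3,
	// "default": 2, "low": 1} and strict priority off, the non-strict path below expands
	// the map into [critical critical critical default default low], shuffles it, and
	// dedupes it via uniq, so higher-weight queues tend to be polled first without
	// starving lower-weight ones.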
	if len(p.queueConfig) == 1 {
		for qname := range p.queueConfig {
			return []string{qname}
		}
	}
	if p.orderedQueues != nil {
		return p.orderedQueues
	}
	var names []string
	for qname, priority := range p.queueConfig {
		for i := 0; i < priority; i++ {
			names = append(names, qname)
		}
	}
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	r.Shuffle(len(names), func(i, j int) { names[i], names[j] = names[j], names[i] })
	return uniq(names, len(p.queueConfig))
}

// perform calls the handler with the given task.
// If the call returns without panic, it simply returns the value;
// otherwise, it recovers from the panic and returns an error.
func (p *processor) perform(ctx context.Context, task *Task) (err error) {
	defer func() {
		if x := recover(); x != nil {
			errMsg := string(debug.Stack())

			p.logger.Errorf("[Common] asynq recovering from panic: %s, See the stack trace below for details:\n%s",
				x, errMsg)
			_, file, line, ok := runtime.Caller(1) // skip the first frame (panic itself)
			if ok && strings.Contains(file, "runtime/") {
				// The panic came from the runtime, most likely due to incorrect
				// map/slice usage. The parent frame should have the real trigger.
				_, file, line, ok = runtime.Caller(2)
			}

			// Include the file and line number info in the error, if runtime.Caller returned ok.
			if ok {
				err = fmt.Errorf("panic [%s:%d]: %v", file, line, x)
			} else {
				err = fmt.Errorf("panic: %v", x)
			}
			err = &errors.PanicError{
				ErrMsg: errMsg,
			}
		}
	}()
	return p.handler.ProcessTask(ctx, task)
}

// uniq dedupes elements and returns a slice of unique names of length l.
// Order of the output slice is based on the input list.
func uniq(names []string, l int) []string {
	var res []string
	seen := make(map[string]struct{})
	for _, s := range names {
		if _, ok := seen[s]; !ok {
			seen[s] = struct{}{}
			res = append(res, s)
		}
		if len(res) == l {
			break
		}
	}
	return res
}

// sortByPriority returns a list of queue names sorted by
// their priority level in descending order.
func sortByPriority(qcfg map[string]int) []string {
	var queues []*queue
	for qname, n := range qcfg {
		queues = append(queues, &queue{qname, n})
	}
	sort.Sort(sort.Reverse(byPriority(queues)))
	var res []string
	for _, q := range queues {
		res = append(res, q.name)
	}
	return res
}

type queue struct {
	name     string
	priority int
}

type byPriority []*queue

func (x byPriority) Len() int           { return len(x) }
func (x byPriority) Less(i, j int) bool { return x[i].priority < x[j].priority }
func (x byPriority) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }

// normalizeQueues divides priority numbers by their greatest common divisor.
func normalizeQueues(queues map[string]int) map[string]int {
	var xs []int
	for _, x := range queues {
		xs = append(xs, x)
	}
	d := gcd(xs...)
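	// Worked example (hypothetical weights): {"critical": 6, "default": 4, "low": 2} has
	// gcd 2, so the division below normalizes it to {"critical": 3, "default": 2, "low": 1}.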
	res := make(map[string]int)
	for q, x := range queues {
		res[q] = x / d
	}
	return res
}

func gcd(xs ...int) int {
	fn := func(x, y int) int {
		for y > 0 {
			x, y = y, x%y
		}
		return x
	}
	res := xs[0]
	for i := 0; i < len(xs); i++ {
		res = fn(xs[i], res)
		if res == 1 {
			return 1
		}
	}
	return res
}

// computeDeadline returns the given task's deadline.
func (p *processor) computeDeadline(msg *base.TaskMessage) time.Time {
	if msg.Timeout == 0 && msg.Deadline == 0 {
		p.logger.Errorf("[Common] asynq: internal error: "+
			"both timeout and deadline are not set for the task message: %s", msg.ID)
		return p.clock.Now().Add(defaultTimeout)
	}
	if msg.Timeout != 0 && msg.Deadline != 0 {
		deadlineUnix := math.Min(float64(p.clock.Now().Unix()+msg.Timeout), float64(msg.Deadline))
		return time.Unix(int64(deadlineUnix), 0)
	}
	if msg.Timeout != 0 {
		return p.clock.Now().Add(time.Duration(msg.Timeout) * time.Second)
	}
	return time.Unix(msg.Deadline, 0)
}

// IsPanicError reports whether err is a panic error recovered from a task handler.
func IsPanicError(err error) bool {
	return errors.IsPanicError(err)
}
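// Usage sketch (illustrative, assuming this fork keeps the upstream asynq ErrorHandlerFunc
// adapter): an ErrorHandler can use IsPanicError to tell recovered handler panics apart
// from ordinary task failures.
//
//	asynq.ErrorHandlerFunc(func(ctx context.Context, task *asynq.Task, err error) {
//		if asynq.IsPanicError(err) {
//			// err was produced by perform recovering from a panic; it carries the stack trace.
//		}
//	})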