code.gitea.io/gitea@v1.22.3/modules/queue/workergroup.go

// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package queue

import (
	"context"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"code.gitea.io/gitea/modules/log"
)

var (
	// infiniteTimerC never receives anything; it is used as a timer channel that never fires
	infiniteTimerC         = make(chan time.Time)
	batchDebounceDuration  = 100 * time.Millisecond
	workerIdleDuration     = 1 * time.Second
	shutdownDefaultTimeout = 2 * time.Second

	unhandledItemRequeueDuration atomic.Int64 // atomic to avoid a data race during tests
)

func init() {
	unhandledItemRequeueDuration.Store(int64(5 * time.Second))
}

// workerGroup is a group of workers to work with a WorkerPoolQueue
type workerGroup[T any] struct {
	q  *WorkerPoolQueue[T]
	wg sync.WaitGroup

	ctxWorker       context.Context
	ctxWorkerCancel context.CancelFunc

	batchBuffer []T         // items collected by doRun before being dispatched to workers as a batch
	popItemChan chan []byte // raw items popped from the base queue by popItemByChan
	popItemErr  chan error  // errors reported by popItemByChan
}

// doPrepareWorkerContext prepares a fresh worker context derived from the queue's run context;
// canceling it stops all running workers.
func (wg *workerGroup[T]) doPrepareWorkerContext() {
	wg.ctxWorker, wg.ctxWorkerCancel = context.WithCancel(wg.q.ctxRun)
}

// doDispatchBatchToWorker dispatches a batch of items to the workers' channel.
// If the channel is full, it tries to start a new worker if possible.
func (q *WorkerPoolQueue[T]) doDispatchBatchToWorker(wg *workerGroup[T], flushChan chan flushType) {
	batch := wg.batchBuffer
	wg.batchBuffer = nil

	if len(batch) == 0 {
		return
	}

	// try a non-blocking send first; if it doesn't succeed, the channel is full and more workers may be needed
	full := false
	select {
	case q.batchChan <- batch:
	default:
		full = true
	}

	// TODO: the logic could be improved in the future, to avoid a data-race between "doStartNewWorker" and "workerNum".
	// The root problem is that if we skip "doStartNewWorker" here, the "workerNum" might be decreased by other workers later.
	// So ideally, it should check whether there are enough workers by some other approach, and start new workers if necessary.
	// This data-race is not serious: as long as a new worker is started soon there will be enough workers,
	// so there is no need for a huge refactor at the moment.
	q.workerNumMu.Lock()
	noWorker := q.workerNum == 0
	if full || noWorker {
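		// start a new worker if we are below the configured limit, or if there is no worker at all and the limit is non-positive
		// ("&&" binds tighter than "||", so the condition groups as: belowLimit || (noWorker && limitNotPositive))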
		if q.workerNum < q.workerMaxNum || noWorker && q.workerMaxNum <= 0 {
			q.workerNum++
			q.doStartNewWorker(wg)
		}
	}
	q.workerNumMu.Unlock()

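	// the non-blocking send above failed: block until a worker takes the batch, but keep serving flush
	// requests and react to shutdown while waiting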
	if full {
		select {
		case q.batchChan <- batch:
		case flush := <-flushChan:
			q.doWorkerHandle(batch)
			q.doFlush(wg, flush)
		case <-q.ctxRun.Done():
			wg.batchBuffer = batch // return the batch to the buffer; the "doRun" function will handle it
		}
	}
}

// doWorkerHandle calls the safeHandler to handle a batch of items, and it increases/decreases the active worker number.
// If the context has been canceled, it should not be called, because "Push" still needs the context; in that case, call q.safeHandler directly.
func (q *WorkerPoolQueue[T]) doWorkerHandle(batch []T) {
	q.workerNumMu.Lock()
	q.workerActiveNum++
	q.workerNumMu.Unlock()

	defer func() {
		q.workerNumMu.Lock()
		q.workerActiveNum--
		q.workerNumMu.Unlock()
	}()

	unhandled := q.safeHandler(batch...)
	// if none of the items were handled, back off for a few seconds;
	// in this case the handler (e.g. a document indexer) may have encountered some errors/failures
	if len(unhandled) == len(batch) && unhandledItemRequeueDuration.Load() != 0 {
		log.Error("Queue %q failed to handle batch of %d items, backoff for a few seconds", q.GetName(), len(batch))
		select {
		case <-q.ctxRun.Done():
		case <-time.After(time.Duration(unhandledItemRequeueDuration.Load())):
		}
	}
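	// requeue unhandled items; if the regular Push fails (for example because the queue is shutting down),
	// fall back to pushing directly into the base queue, and only log when that fallback is not applicable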
	for _, item := range unhandled {
		if err := q.Push(item); err != nil {
			if !q.basePushForShutdown(item) {
				log.Error("Failed to requeue item for queue %q when calling handler: %v", q.GetName(), err)
			}
		}
	}
}

// basePushForShutdown tries to requeue items into the base queue when the WorkerPoolQueue is shutting down.
// If the queue is shutting down, it tries to push the items and returns true.
// Otherwise it does nothing and returns false.
func (q *WorkerPoolQueue[T]) basePushForShutdown(items ...T) bool {
	shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
	if shutdownTimeout == 0 {
		return false
	}
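	// q.ctxRun may already be canceled at this point, so use a fresh context bounded by the shutdown timeout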
	ctxShutdown, ctxShutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout)
	defer ctxShutdownCancel()
	for _, item := range items {
		// if pushing still fails, there is nothing more the queue can do; the item is lost and only the error is logged
		if err := q.baseQueue.PushItem(ctxShutdown, q.marshal(item)); err != nil {
			log.Error("Failed to requeue item for queue %q when shutting down: %v", q.GetName(), err)
		}
	}
	return true
}

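// resetIdleTicker restarts the idle ticker and drains any tick that may have fired before the reset,
// so a stale tick does not immediately stop a worker that just finished handling a batch.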
func resetIdleTicker(t *time.Ticker, dur time.Duration) {
	t.Reset(dur)
	select {
	case <-t.C:
	default:
	}
}

// doStartNewWorker starts a new worker for the queue; the worker reads batches from the workers' channel and handles them.
func (q *WorkerPoolQueue[T]) doStartNewWorker(wp *workerGroup[T]) {
	wp.wg.Add(1)

	go func() {
		defer wp.wg.Done()

		log.Debug("Queue %q starts new worker", q.GetName())
		defer log.Debug("Queue %q stops idle worker", q.GetName())

		t := time.NewTicker(workerIdleDuration)
		defer t.Stop()

		keepWorking := true
		stopWorking := func() {
			q.workerNumMu.Lock()
			keepWorking = false
			q.workerNum--
			q.workerNumMu.Unlock()
		}
		for keepWorking {
			select {
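			// the worker context is canceled when doFlush stops all workers or when the whole queue shuts down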
			case <-wp.ctxWorker.Done():
				stopWorking()
			case batch, ok := <-q.batchChan:
				if !ok {
					stopWorking()
					continue
				}
				q.doWorkerHandle(batch)
				// reset the idle ticker, and drain the tick after reset in case a tick is already triggered
				resetIdleTicker(t, workerIdleDuration) // key code for TestWorkerPoolQueueWorkerIdleReset
			case <-t.C:
				q.workerNumMu.Lock()
				keepWorking = q.workerNum <= 1 // keep the last worker running
				if !keepWorking {
					q.workerNum--
				}
				q.workerNumMu.Unlock()
			}
		}
	}()
}

// doFlush flushes the queue: it tries to read all items from the queue and handles them.
// It is for testing purposes only. It is not designed to work in a cluster.
func (q *WorkerPoolQueue[T]) doFlush(wg *workerGroup[T], flush flushType) {
	log.Debug("Queue %q starts flushing", q.GetName())
	defer log.Debug("Queue %q finishes flushing", q.GetName())

	// stop all workers, and prepare a new worker context to start new workers

	wg.ctxWorkerCancel()
	wg.wg.Wait()

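	// when the flush finishes, signal the waiter by closing the flush channel and prepare a fresh worker
	// context so that new workers can be started again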
	defer func() {
		close(flush)
		wg.doPrepareWorkerContext()
	}()

	// drain the batch channel first
loop:
	for {
		select {
		case batch := <-q.batchChan:
			q.doWorkerHandle(batch)
		default:
			break loop
		}
	}

	// drain the popItem channel
	emptyCounter := 0
	for {
		select {
		case data, dataOk := <-wg.popItemChan:
			if !dataOk {
				return
			}
			emptyCounter = 0
			if v, jsonOk := q.unmarshal(data); !jsonOk {
				continue
			} else {
				q.doWorkerHandle([]T{v})
			}
		case err := <-wg.popItemErr:
			if !q.isCtxRunCanceled() {
				log.Error("Failed to pop item from queue %q (doFlush): %v", q.GetName(), err)
			}
			return
		case <-q.ctxRun.Done():
			log.Debug("Queue %q is shutting down", q.GetName())
			return
		case <-time.After(20 * time.Millisecond):
			// There is no reliable way to make sure all queue items are consumed by the Flush: there might always be some items stored in buffers or temporary variables.
			// If Gitea runs in a cluster, we cannot even guarantee that all items are consumed by a deterministic instance.
			// Luckily, the "Flush" trick is only used in tests, so this is good enough so far.
			if cnt, _ := q.baseQueue.Len(q.ctxRun); cnt == 0 && len(wg.popItemChan) == 0 {
				emptyCounter++
			}
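			// treat the queue as fully drained only after it has been observed empty on two consecutive checks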
			if emptyCounter >= 2 {
				return
			}
		}
	}
}

func (q *WorkerPoolQueue[T]) isCtxRunCanceled() bool {
	select {
	case <-q.ctxRun.Done():
		return true
	default:
		return false
	}
}

var skipFlushChan = make(chan flushType) // a flush chan that never receives anything, used to skip reading other flush requests

// doRun is the main loop of the queue. All related "doXxx" functions are executed in its context.
func (q *WorkerPoolQueue[T]) doRun() {
	pprof.SetGoroutineLabels(q.ctxRun)

	log.Debug("Queue %q starts running", q.GetName())
	defer log.Debug("Queue %q stops running", q.GetName())

	wg := &workerGroup[T]{q: q}
	wg.doPrepareWorkerContext()
	wg.popItemChan, wg.popItemErr = popItemByChan(q.ctxRun, q.baseQueue.PopItem)

	defer func() {
		q.ctxRunCancel()

		// drain all in-flight data: since the queue is shutting down, the items can't be dispatched to workers because the context is canceled,
		// and doWorkerHandle can't be called either, because there would be no chance to push unhandled items back to the queue
		var unhandled []T
		close(q.batchChan)
		for batch := range q.batchChan {
			unhandled = append(unhandled, batch...)
		}
		unhandled = append(unhandled, wg.batchBuffer...)
		for data := range wg.popItemChan {
			if v, ok := q.unmarshal(data); ok {
				unhandled = append(unhandled, v)
			}
		}

		shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
		if shutdownTimeout != 0 {
			// if a shutdown timeout is configured, try to push the items back to the base queue
			q.basePushForShutdown(unhandled...)
			workerDone := make(chan struct{})
			// this is the only way to wait for the workers, because the handlers do not have a context to wait on
			go func() { wg.wg.Wait(); close(workerDone) }()
			select {
			case <-workerDone:
			case <-time.After(shutdownTimeout):
				log.Error("Queue %q is shutting down, but workers are still running after timeout", q.GetName())
			}
		} else {
			// if no shutdown timeout is configured, just call the handler to try to handle the items; if the handler fails again, the items are lost
			q.safeHandler(unhandled...)
		}

		close(q.shutdownDone)
	}()

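	// batchDispatchC is the debounce timer for dispatching a partially filled batch;
	// it points at infiniteTimerC (which never fires) while no dispatch is pending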
	var batchDispatchC <-chan time.Time = infiniteTimerC
	for {
		select {
		case data, dataOk := <-wg.popItemChan:
			if !dataOk {
				return
			}
			if v, jsonOk := q.unmarshal(data); !jsonOk {
				testRecorder.Record("pop:corrupted:%s", data) // in rare cases the levelqueue (leveldb) might be corrupted
				continue
			} else {
				wg.batchBuffer = append(wg.batchBuffer, v)
			}
			if len(wg.batchBuffer) >= q.batchLength {
				q.doDispatchBatchToWorker(wg, q.flushChan)
			} else if batchDispatchC == infiniteTimerC {
				batchDispatchC = time.After(batchDebounceDuration)
			} // else: batchDispatchC is already a debounce timer, it will be triggered soon
		case <-batchDispatchC:
			batchDispatchC = infiniteTimerC
			q.doDispatchBatchToWorker(wg, q.flushChan)
		case flush := <-q.flushChan:
			// before flushing, try to dispatch the buffered batch to a worker first, in case there is no worker running;
			// after dispatching, there is at least one worker running, so "doFlush" can wait for the workers to finish.
			// since we are already in a "flush" operation, the dispatching function shouldn't read from the flush chan.
			q.doDispatchBatchToWorker(wg, skipFlushChan)
			q.doFlush(wg, flush)
		case err := <-wg.popItemErr:
			if !q.isCtxRunCanceled() {
				log.Error("Failed to pop item from queue %q (doRun): %v", q.GetName(), err)
			}
			return
		case <-q.ctxRun.Done():
			log.Debug("Queue %q is shutting down", q.GetName())
			return
		}
	}
}