github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/queue/workerpool.go

// Copyright 2023 The GitBundle Inc. All rights reserved.
// Copyright 2017 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package queue

import (
	"context"
	"fmt"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/gitbundle/modules/log"
	"github.com/gitbundle/modules/process"
	"github.com/gitbundle/modules/util"
)

// WorkerPool represents a dynamically growable worker pool for a
// provided handler function. It has an internal channel which it
// uses to detect blocking, and it will grow and shrink in response
// to demand as per configuration.
type WorkerPool struct {
	// This field must be the first one in the struct.
	// This is to allow 64 bit atomic operations on 32-bit machines.
	// See: https://pkg.go.dev/sync/atomic#pkg-note-BUG & GitBundle issue 19518
	numInQueue         int64
	lock               sync.Mutex
	baseCtx            context.Context
	baseCtxCancel      context.CancelFunc
	baseCtxFinished    process.FinishedFunc
	paused             chan struct{}
	resumed            chan struct{}
	cond               *sync.Cond
	qid                int64
	maxNumberOfWorkers int
	numberOfWorkers    int
	batchLength        int
	handle             HandlerFunc
	dataChan           chan Data
	blockTimeout       time.Duration
	boostTimeout       time.Duration
	boostWorkers       int
}

var (
	_ Flushable   = &WorkerPool{}
	_ ManagedPool = &WorkerPool{}
)

// WorkerPoolConfiguration is the basic configuration for a WorkerPool
type WorkerPoolConfiguration struct {
	Name         string
	QueueLength  int
	BatchLength  int
	BlockTimeout time.Duration
	BoostTimeout time.Duration
	BoostWorkers int
	MaxWorkers   int
}

// NewWorkerPool creates a new worker pool
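//
// Illustrative sketch only: the handler body and the configuration values below
// are hypothetical, and the handler is assumed to have the func(...Data) []Data
// shape used elsewhere in this package (returning any unhandled items):
//
//	pool := NewWorkerPool(func(data ...Data) []Data {
//		// process each datum; return any items that could not be handled
//		return nil
//	}, WorkerPoolConfiguration{
//		Name:         "example",
//		QueueLength:  20,
//		BatchLength:  20,
//		BlockTimeout: 1 * time.Second,
//		BoostTimeout: 5 * time.Minute,
//		BoostWorkers: 1,
//		MaxWorkers:   5,
//	})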
func NewWorkerPool(handle HandlerFunc, config WorkerPoolConfiguration) *WorkerPool {
	ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), fmt.Sprintf("Queue: %s", config.Name), process.SystemProcessType, false)

	dataChan := make(chan Data, config.QueueLength)
	pool := &WorkerPool{
		baseCtx:            ctx,
		baseCtxCancel:      cancel,
		baseCtxFinished:    finished,
		batchLength:        config.BatchLength,
		dataChan:           dataChan,
		resumed:            closedChan,
		paused:             make(chan struct{}),
		handle:             handle,
		blockTimeout:       config.BlockTimeout,
		boostTimeout:       config.BoostTimeout,
		boostWorkers:       config.BoostWorkers,
		maxNumberOfWorkers: config.MaxWorkers,
	}

	return pool
}

// Done returns a channel that is closed when this worker pool's base context is cancelled
func (p *WorkerPool) Done() <-chan struct{} {
	return p.baseCtx.Done()
}

// Push pushes the data to the internal channel
func (p *WorkerPool) Push(data Data) {
	atomic.AddInt64(&p.numInQueue, 1)
	p.lock.Lock()
	select {
	case <-p.paused:
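		// The pool is paused: just queue the data; it will be picked up once the pool resumes.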
		p.lock.Unlock()
		p.dataChan <- data
		return
	default:
	}

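	// If boosting is configured and we are below the worker cap (or the cap is
	// unlimited), push with the possibility of boosting; otherwise just block on the channel.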
	if p.blockTimeout > 0 && p.boostTimeout > 0 && (p.numberOfWorkers <= p.maxNumberOfWorkers || p.maxNumberOfWorkers < 0) {
		if p.numberOfWorkers == 0 {
			p.zeroBoost()
		} else {
			p.lock.Unlock()
		}
		p.pushBoost(data)
	} else {
		p.lock.Unlock()
		p.dataChan <- data
	}
}

// HasNoWorkerScaling returns true if the queue has no workers and no worker boosting configured
func (p *WorkerPool) HasNoWorkerScaling() bool {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.hasNoWorkerScaling()
}

func (p *WorkerPool) hasNoWorkerScaling() bool {
	return p.numberOfWorkers == 0 && (p.boostTimeout == 0 || p.boostWorkers == 0 || p.maxNumberOfWorkers == 0)
}

// zeroBoost will add a temporary boost worker for a queue with no workers
// p.lock must be locked at the start of this function BUT it will be unlocked by the end of this function
// (This is because addWorkers has to be called whilst unlocked)
func (p *WorkerPool) zeroBoost() {
	ctx, cancel := context.WithTimeout(p.baseCtx, p.boostTimeout)
	mq := GetManager().GetManagedQueue(p.qid)
	boost := p.boostWorkers
	if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 {
		boost = p.maxNumberOfWorkers - p.numberOfWorkers
	}
	if mq != nil {
		log.Debug("WorkerPool: %d (for %s) has zero workers - adding %d temporary workers for %s", p.qid, mq.Name, boost, p.boostTimeout)

		start := time.Now()
		pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), cancel, false)
		cancel = func() {
			mq.RemoveWorkers(pid)
		}
	} else {
		log.Debug("WorkerPool: %d has zero workers - adding %d temporary workers for %s", p.qid, p.boostWorkers, p.boostTimeout)
	}
	p.lock.Unlock()
	p.addWorkers(ctx, cancel, boost)
}

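// pushBoost tries a non-blocking send first; if the channel is full it waits up
// to blockTimeout, and on timeout adds temporary boost workers before blocking.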
func (p *WorkerPool) pushBoost(data Data) {
	select {
	case p.dataChan <- data:
	default:
		p.lock.Lock()
		if p.blockTimeout <= 0 {
			p.lock.Unlock()
			p.dataChan <- data
			return
		}
		ourTimeout := p.blockTimeout
		timer := time.NewTimer(p.blockTimeout)
		p.lock.Unlock()
		select {
		case p.dataChan <- data:
			util.StopTimer(timer)
		case <-timer.C:
			p.lock.Lock()
			if p.blockTimeout > ourTimeout || (p.numberOfWorkers > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0) {
				p.lock.Unlock()
				p.dataChan <- data
				return
			}
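			// Back off: double the block timeout so sustained blocking boosts less often.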
			p.blockTimeout *= 2
			boostCtx, boostCtxCancel := context.WithCancel(p.baseCtx)
			mq := GetManager().GetManagedQueue(p.qid)
			boost := p.boostWorkers
			if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 {
				boost = p.maxNumberOfWorkers - p.numberOfWorkers
			}
			if mq != nil {
				log.Debug("WorkerPool: %d (for %s) Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, mq.Name, ourTimeout, boost, p.boostTimeout, p.blockTimeout)

				start := time.Now()
				pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), boostCtxCancel, false)
				go func() {
					<-boostCtx.Done()
					mq.RemoveWorkers(pid)
					boostCtxCancel()
				}()
			} else {
				log.Debug("WorkerPool: %d Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, ourTimeout, p.boostWorkers, p.boostTimeout, p.blockTimeout)
			}
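			// Once the boost period ends, cancel the boost workers and halve the block timeout back down.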
			go func() {
				<-time.After(p.boostTimeout)
				boostCtxCancel()
				p.lock.Lock()
				p.blockTimeout /= 2
				p.lock.Unlock()
			}()
			p.lock.Unlock()
			p.addWorkers(boostCtx, boostCtxCancel, boost)
			p.dataChan <- data
		}
	}
}

// NumberOfWorkers returns the number of current workers in the pool
func (p *WorkerPool) NumberOfWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.numberOfWorkers
}

// NumberInQueue returns the number of items in the queue
func (p *WorkerPool) NumberInQueue() int64 {
	return atomic.LoadInt64(&p.numInQueue)
}

// MaxNumberOfWorkers returns the maximum number of workers automatically added to the pool
func (p *WorkerPool) MaxNumberOfWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.maxNumberOfWorkers
}

// BoostWorkers returns the number of workers for a boost
func (p *WorkerPool) BoostWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.boostWorkers
}

// BoostTimeout returns the timeout of the next boost
func (p *WorkerPool) BoostTimeout() time.Duration {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.boostTimeout
}

// BlockTimeout returns the timeout until the next boost
func (p *WorkerPool) BlockTimeout() time.Duration {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.blockTimeout
}

// SetPoolSettings sets the settable boost values
func (p *WorkerPool) SetPoolSettings(maxNumberOfWorkers, boostWorkers int, timeout time.Duration) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.maxNumberOfWorkers = maxNumberOfWorkers
	p.boostWorkers = boostWorkers
	p.boostTimeout = timeout
}

// SetMaxNumberOfWorkers sets the maximum number of workers automatically added to the pool
// Changing this number will not change the number of current workers but will change the limit
// for future additions
func (p *WorkerPool) SetMaxNumberOfWorkers(newMax int) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.maxNumberOfWorkers = newMax
}

func (p *WorkerPool) commonRegisterWorkers(number int, timeout time.Duration, isFlusher bool) (context.Context, context.CancelFunc) {
	var ctx context.Context
	var cancel context.CancelFunc
	start := time.Now()
	end := start
	hasTimeout := false
	if timeout > 0 {
		ctx, cancel = context.WithTimeout(p.baseCtx, timeout)
		end = start.Add(timeout)
		hasTimeout = true
	} else {
		ctx, cancel = context.WithCancel(p.baseCtx)
	}

	mq := GetManager().GetManagedQueue(p.qid)
	if mq != nil {
		pid := mq.RegisterWorkers(number, start, hasTimeout, end, cancel, isFlusher)
		log.Trace("WorkerPool: %d (for %s) adding %d workers with group id: %d", p.qid, mq.Name, number, pid)
		return ctx, func() {
			mq.RemoveWorkers(pid)
		}
	}
	log.Trace("WorkerPool: %d adding %d workers (no group id)", p.qid, number)

	return ctx, cancel
}

// AddWorkers adds workers to the pool - this allows the number of workers to go above the limit
func (p *WorkerPool) AddWorkers(number int, timeout time.Duration) context.CancelFunc {
	ctx, cancel := p.commonRegisterWorkers(number, timeout, false)
	p.addWorkers(ctx, cancel, number)
	return cancel
}

// addWorkers adds workers to the pool
func (p *WorkerPool) addWorkers(ctx context.Context, cancel context.CancelFunc, number int) {
	for i := 0; i < number; i++ {
		p.lock.Lock()
		if p.cond == nil {
			p.cond = sync.NewCond(&p.lock)
		}
		p.numberOfWorkers++
		p.lock.Unlock()
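		// Each worker runs in its own goroutine; when it exits it decrements the
		// worker count and decides whether the pool needs pausing or a re-boost.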
		go func() {
			pprof.SetGoroutineLabels(ctx)
			p.doWork(ctx)

			p.lock.Lock()
			p.numberOfWorkers--
			if p.numberOfWorkers == 0 {
				p.cond.Broadcast()
				cancel()
			} else if p.numberOfWorkers < 0 {
				// numberOfWorkers can't go negative but...
				log.Warn("Number of Workers < 0 for QID %d - this shouldn't happen", p.qid)
				p.numberOfWorkers = 0
				p.cond.Broadcast()
				cancel()
			}
			select {
			case <-p.baseCtx.Done():
				// Don't warn or check for ongoing work if the baseCtx is shutdown
			case <-p.paused:
				// Don't warn or check for ongoing work if the pool is paused
			default:
				if p.hasNoWorkerScaling() {
					log.Warn(
						"Queue: %d is configured to be non-scaling and has no workers - this configuration is likely incorrect.\n"+
							"The queue will be paused to prevent data-loss with the assumption that you will add workers and unpause as required.", p.qid)
					p.pause()
				} else if p.numberOfWorkers == 0 && atomic.LoadInt64(&p.numInQueue) > 0 {
					// OK there are no workers but... there's still work to be done -> Reboost
					p.zeroBoost()
					// p.lock will be unlocked by zeroBoost
					return
				}
			}
			p.lock.Unlock()
		}()
	}
}


// Wait waits for the WorkerPool to finish
func (p *WorkerPool) Wait() {
	p.lock.Lock()
	defer p.lock.Unlock()
	if p.cond == nil {
		p.cond = sync.NewCond(&p.lock)
	}
	if p.numberOfWorkers <= 0 {
		return
	}
	p.cond.Wait()
}

// IsPaused returns whether the pool is paused
func (p *WorkerPool) IsPaused() bool {
	p.lock.Lock()
	defer p.lock.Unlock()
	select {
	case <-p.paused:
		return true
	default:
		return false
	}
}

// IsPausedIsResumed returns a channel that is closed when the pool is paused
// and a channel that is closed when it is resumed
func (p *WorkerPool) IsPausedIsResumed() (<-chan struct{}, <-chan struct{}) {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.paused, p.resumed
}

// Pause pauses the WorkerPool
func (p *WorkerPool) Pause() {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.pause()
}

func (p *WorkerPool) pause() {
	select {
	case <-p.paused:
	default:
		p.resumed = make(chan struct{})
		close(p.paused)
	}
}

// Resume resumes the WorkerPool
func (p *WorkerPool) Resume() {
	p.lock.Lock() // can't defer unlock because of the zeroBoost at the end
	select {
	case <-p.resumed:
		// already resumed - there's nothing to do
		p.lock.Unlock()
		return
	default:
	}

	p.paused = make(chan struct{})
	close(p.resumed)

	// OK now we need to check if we need to add some workers...
	if p.numberOfWorkers > 0 || p.hasNoWorkerScaling() || atomic.LoadInt64(&p.numInQueue) == 0 {
		// We either have workers, can't scale, or there's no work to be done -> so just resume
		p.lock.Unlock()
		return
	}

	// OK we got some work but no workers - we need to think about boosting
	select {
	case <-p.baseCtx.Done():
		// don't bother boosting if the baseCtx is done
		p.lock.Unlock()
		return
	default:
	}

	// OK we'd better add some boost workers!
	p.zeroBoost()
	// p.zeroBoost will unlock the lock
}

// CleanUp will drain the remaining contents of the channel
// This should be called after the AddWorkers context is closed
func (p *WorkerPool) CleanUp(ctx context.Context) {
	log.Trace("WorkerPool: %d CleanUp", p.qid)
	close(p.dataChan)
	for data := range p.dataChan {
		if unhandled := p.handle(data); unhandled != nil {
			log.Error("Unhandled Data in clean-up of queue %d", p.qid)
		}

		atomic.AddInt64(&p.numInQueue, -1)
		select {
		case <-ctx.Done():
			log.Warn("WorkerPool: %d Cleanup context closed before finishing clean-up", p.qid)
			return
		default:
		}
	}
	log.Trace("WorkerPool: %d CleanUp Done", p.qid)
}

// Flush flushes the channel with a timeout - the Flush worker will be registered as a flush worker with the manager
func (p *WorkerPool) Flush(timeout time.Duration) error {
	ctx, cancel := p.commonRegisterWorkers(1, timeout, true)
	defer cancel()
	return p.FlushWithContext(ctx)
}

// IsEmpty returns true if the worker queue is empty
func (p *WorkerPool) IsEmpty() bool {
	return atomic.LoadInt64(&p.numInQueue) == 0
}

// FlushWithContext is very similar to CleanUp but it will return as soon as the dataChan is empty
// NB: The worker will not be registered with the manager.
func (p *WorkerPool) FlushWithContext(ctx context.Context) error {
	log.Trace("WorkerPool: %d Flush", p.qid)
	for {
		select {
		case data := <-p.dataChan:
			if unhandled := p.handle(data); unhandled != nil {
				log.Error("Unhandled Data whilst flushing queue %d", p.qid)
			}
			atomic.AddInt64(&p.numInQueue, -1)
		case <-p.baseCtx.Done():
			return p.baseCtx.Err()
		case <-ctx.Done():
			return ctx.Err()
		default:
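			// Neither data nor cancellation is pending - the channel is drained, so the flush is done.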
			return nil
		}
	}
}

func (p *WorkerPool) doWork(ctx context.Context) {
	pprof.SetGoroutineLabels(ctx)
	delay := time.Millisecond * 300

	// Create a common timer - we will use this elsewhere
	timer := time.NewTimer(0)
	util.StopTimer(timer)

	paused, _ := p.IsPausedIsResumed()
	data := make([]Data, 0, p.batchLength)
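	// Main loop: honour pause and shutdown, batch incoming data up to batchLength,
	// and flush a partial batch when the timer fires first.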
	for {
		select {
		case <-paused:
			log.Trace("Worker for Queue %d Pausing", p.qid)
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				// Reset the batch so these items are not handled a second time after resuming
				data = make([]Data, 0, p.batchLength)
			}
			_, resumed := p.IsPausedIsResumed()
			select {
			case <-resumed:
				paused, _ = p.IsPausedIsResumed()
				log.Trace("Worker for Queue %d Resuming", p.qid)
				util.StopTimer(timer)
			case <-ctx.Done():
				log.Trace("Worker shutting down")
				return
			}
		default:
		}
		select {
		case <-paused:
			// go back around
		case <-ctx.Done():
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
			}
			log.Trace("Worker shutting down")
			return
		case datum, ok := <-p.dataChan:
			if !ok {
				// the dataChan has been closed - we should finish up:
				if len(data) > 0 {
					log.Trace("Handling: %d data, %v", len(data), data)
					if unhandled := p.handle(data...); unhandled != nil {
						log.Error("Unhandled Data in queue %d", p.qid)
					}
					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				}
				log.Trace("Worker shutting down")
				return
			}
			data = append(data, datum)
			util.StopTimer(timer)

			if len(data) >= p.batchLength {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				data = make([]Data, 0, p.batchLength)
			} else {
				timer.Reset(delay)
			}
		case <-timer.C:
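			// The batch sat idle for the full delay: flush what has accumulated and
			// use a shorter delay for subsequent partial batches.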
			delay = time.Millisecond * 100
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				data = make([]Data, 0, p.batchLength)
			}
		}
	}
}