github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/queue/workerpool.go

// Copyright 2023 The GitBundle Inc. All rights reserved.
// Copyright 2017 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package queue

import (
	"context"
	"fmt"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/gitbundle/modules/log"
	"github.com/gitbundle/modules/process"
	"github.com/gitbundle/modules/util"
)

// WorkerPool represents a dynamically growable worker pool for a
// provided handler function. It has an internal channel which it
// uses to detect blocking, and it will grow and shrink in response
// to demand as per configuration.
type WorkerPool struct {
	// This field must be the first one in the struct to allow 64-bit
	// atomic operations on 32-bit machines.
	// See: https://pkg.go.dev/sync/atomic#pkg-note-BUG & GitBundle issue 19518
	numInQueue         int64
	lock               sync.Mutex
	baseCtx            context.Context
	baseCtxCancel      context.CancelFunc
	baseCtxFinished    process.FinishedFunc
	paused             chan struct{}
	resumed            chan struct{}
	cond               *sync.Cond
	qid                int64
	maxNumberOfWorkers int
	numberOfWorkers    int
	batchLength        int
	handle             HandlerFunc
	dataChan           chan Data
	blockTimeout       time.Duration
	boostTimeout       time.Duration
	boostWorkers       int
}

var (
	_ Flushable   = &WorkerPool{}
	_ ManagedPool = &WorkerPool{}
)

// WorkerPoolConfiguration is the basic configuration for a WorkerPool
type WorkerPoolConfiguration struct {
	Name         string
	QueueLength  int
	BatchLength  int
	BlockTimeout time.Duration
	BoostTimeout time.Duration
	BoostWorkers int
	MaxWorkers   int
}

// NewWorkerPool creates a new worker pool
func NewWorkerPool(handle HandlerFunc, config WorkerPoolConfiguration) *WorkerPool {
	ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), fmt.Sprintf("Queue: %s", config.Name), process.SystemProcessType, false)

	dataChan := make(chan Data, config.QueueLength)
	pool := &WorkerPool{
		baseCtx:            ctx,
		baseCtxCancel:      cancel,
		baseCtxFinished:    finished,
		batchLength:        config.BatchLength,
		dataChan:           dataChan,
		resumed:            closedChan,
		paused:             make(chan struct{}),
		handle:             handle,
		blockTimeout:       config.BlockTimeout,
		boostTimeout:       config.BoostTimeout,
		boostWorkers:       config.BoostWorkers,
		maxNumberOfWorkers: config.MaxWorkers,
	}

	return pool
}

// Done returns when this worker pool's base context has been cancelled
func (p *WorkerPool) Done() <-chan struct{} {
	return p.baseCtx.Done()
}

// Push pushes the data to the internal channel
func (p *WorkerPool) Push(data Data) {
	atomic.AddInt64(&p.numInQueue, 1)
	p.lock.Lock()
	select {
	case <-p.paused:
		p.lock.Unlock()
		p.dataChan <- data
		return
	default:
	}

	if p.blockTimeout > 0 && p.boostTimeout > 0 && (p.numberOfWorkers <= p.maxNumberOfWorkers || p.maxNumberOfWorkers < 0) {
		if p.numberOfWorkers == 0 {
			p.zeroBoost()
		} else {
			p.lock.Unlock()
		}
		p.pushBoost(data)
	} else {
		p.lock.Unlock()
		p.dataChan <- data
	}
}
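// The sketch below is illustrative only and is not part of the original file.
// It shows one plausible way to construct and feed a WorkerPool, assuming
// HandlerFunc has the shape implied by the p.handle(data...) calls in doWork
// below, i.e. func(...Data) (unhandled []Data), and that Data is an empty
// interface as in the Gitea queue package this derives from. The
// configuration values are arbitrary.
func exampleWorkerPoolPush() {
	pool := NewWorkerPool(func(data ...Data) (unhandled []Data) {
		// Handle a batch; returning non-nil would be reported as unhandled data.
		for _, datum := range data {
			log.Debug("handled: %v", datum)
		}
		return nil
	}, WorkerPoolConfiguration{
		Name:         "example",       // used for the process manager description
		QueueLength:  20,              // buffer size of the internal dataChan
		BatchLength:  4,               // max items passed to the handler at once
		BlockTimeout: 1 * time.Second, // how long Push may block before boosting
		BoostTimeout: 5 * time.Minute, // lifetime of temporary boost workers
		BoostWorkers: 2,               // temporary workers added per boost
		MaxWorkers:   10,              // cap on automatically added workers
	})

	// With zero workers, the first Push triggers zeroBoost, which adds
	// temporary workers for BoostTimeout.
	pool.Push("payload")
}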
// HasNoWorkerScaling returns true if the queue has no workers and no worker boosting
func (p *WorkerPool) HasNoWorkerScaling() bool {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.hasNoWorkerScaling()
}

func (p *WorkerPool) hasNoWorkerScaling() bool {
	return p.numberOfWorkers == 0 && (p.boostTimeout == 0 || p.boostWorkers == 0 || p.maxNumberOfWorkers == 0)
}

// zeroBoost will add temporary boost workers to a queue that has no workers.
// p.lock must be locked at the start of this function BUT it will be unlocked by the end of this function
// (this is because addWorkers has to be called whilst unlocked)
func (p *WorkerPool) zeroBoost() {
	ctx, cancel := context.WithTimeout(p.baseCtx, p.boostTimeout)
	mq := GetManager().GetManagedQueue(p.qid)
	boost := p.boostWorkers
	if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 {
		boost = p.maxNumberOfWorkers - p.numberOfWorkers
	}
	if mq != nil {
		log.Debug("WorkerPool: %d (for %s) has zero workers - adding %d temporary workers for %s", p.qid, mq.Name, boost, p.boostTimeout)

		start := time.Now()
		pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), cancel, false)
		cancel = func() {
			mq.RemoveWorkers(pid)
		}
	} else {
		log.Debug("WorkerPool: %d has zero workers - adding %d temporary workers for %s", p.qid, p.boostWorkers, p.boostTimeout)
	}
	p.lock.Unlock()
	p.addWorkers(ctx, cancel, boost)
}

func (p *WorkerPool) pushBoost(data Data) {
	select {
	case p.dataChan <- data:
	default:
		p.lock.Lock()
		if p.blockTimeout <= 0 {
			p.lock.Unlock()
			p.dataChan <- data
			return
		}
		ourTimeout := p.blockTimeout
		timer := time.NewTimer(p.blockTimeout)
		p.lock.Unlock()
		select {
		case p.dataChan <- data:
			util.StopTimer(timer)
		case <-timer.C:
			p.lock.Lock()
			if p.blockTimeout > ourTimeout || (p.numberOfWorkers > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0) {
				p.lock.Unlock()
				p.dataChan <- data
				return
			}
			p.blockTimeout *= 2
			boostCtx, boostCtxCancel := context.WithCancel(p.baseCtx)
			mq := GetManager().GetManagedQueue(p.qid)
			boost := p.boostWorkers
			if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 {
				boost = p.maxNumberOfWorkers - p.numberOfWorkers
			}
			if mq != nil {
				log.Debug("WorkerPool: %d (for %s) Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, mq.Name, ourTimeout, boost, p.boostTimeout, p.blockTimeout)

				start := time.Now()
				pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), boostCtxCancel, false)
				go func() {
					<-boostCtx.Done()
					mq.RemoveWorkers(pid)
					boostCtxCancel()
				}()
			} else {
				log.Debug("WorkerPool: %d Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, ourTimeout, p.boostWorkers, p.boostTimeout, p.blockTimeout)
			}
			go func() {
				<-time.After(p.boostTimeout)
				boostCtxCancel()
				p.lock.Lock()
				p.blockTimeout /= 2
				p.lock.Unlock()
			}()
			p.lock.Unlock()
			p.addWorkers(boostCtx, boostCtxCancel, boost)
			p.dataChan <- data
		}
	}
}

// NumberOfWorkers returns the number of current workers in the pool
func (p *WorkerPool) NumberOfWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.numberOfWorkers
}

// NumberInQueue returns the number of items in the queue
func (p *WorkerPool) NumberInQueue() int64 {
	return atomic.LoadInt64(&p.numInQueue)
}
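// Illustrative sketch (not part of the original file): the getters above and
// below take p.lock and are safe to poll from other goroutines, e.g. to
// observe boosting under load. Note that BlockTimeout doubles on each boost
// in pushBoost and is halved again once the boost expires.
func exampleObservePool(pool *WorkerPool) {
	for i := 0; i < 100; i++ {
		pool.Push(i)
	}
	log.Debug("workers: %d queued: %d block timeout: %v",
		pool.NumberOfWorkers(), pool.NumberInQueue(), pool.BlockTimeout())
}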
// MaxNumberOfWorkers returns the maximum number of workers automatically added to the pool
func (p *WorkerPool) MaxNumberOfWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.maxNumberOfWorkers
}

// BoostWorkers returns the number of workers for a boost
func (p *WorkerPool) BoostWorkers() int {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.boostWorkers
}

// BoostTimeout returns the timeout of the next boost
func (p *WorkerPool) BoostTimeout() time.Duration {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.boostTimeout
}

// BlockTimeout returns the timeout until the next boost
func (p *WorkerPool) BlockTimeout() time.Duration {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.blockTimeout
}

// SetPoolSettings sets the settable boost values
func (p *WorkerPool) SetPoolSettings(maxNumberOfWorkers, boostWorkers int, timeout time.Duration) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.maxNumberOfWorkers = maxNumberOfWorkers
	p.boostWorkers = boostWorkers
	p.boostTimeout = timeout
}

// SetMaxNumberOfWorkers sets the maximum number of workers automatically added to the pool.
// Changing this number will not change the number of current workers but will change the limit
// for future additions.
func (p *WorkerPool) SetMaxNumberOfWorkers(newMax int) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.maxNumberOfWorkers = newMax
}

func (p *WorkerPool) commonRegisterWorkers(number int, timeout time.Duration, isFlusher bool) (context.Context, context.CancelFunc) {
	var ctx context.Context
	var cancel context.CancelFunc
	start := time.Now()
	end := start
	hasTimeout := false
	if timeout > 0 {
		ctx, cancel = context.WithTimeout(p.baseCtx, timeout)
		end = start.Add(timeout)
		hasTimeout = true
	} else {
		ctx, cancel = context.WithCancel(p.baseCtx)
	}

	mq := GetManager().GetManagedQueue(p.qid)
	if mq != nil {
		pid := mq.RegisterWorkers(number, start, hasTimeout, end, cancel, isFlusher)
		log.Trace("WorkerPool: %d (for %s) adding %d workers with group id: %d", p.qid, mq.Name, number, pid)
		return ctx, func() {
			mq.RemoveWorkers(pid)
		}
	}
	log.Trace("WorkerPool: %d adding %d workers (no group id)", p.qid, number)

	return ctx, cancel
}

// AddWorkers adds workers to the pool - this allows the number of workers to go above the limit
func (p *WorkerPool) AddWorkers(number int, timeout time.Duration) context.CancelFunc {
	ctx, cancel := p.commonRegisterWorkers(number, timeout, false)
	p.addWorkers(ctx, cancel, number)
	return cancel
}
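// Illustrative sketch (not part of the original file): adding permanent
// workers (a timeout of 0 means they run until cancelled) and shutting them
// down again via the returned CancelFunc.
func exampleAddWorkers(pool *WorkerPool) {
	cancel := pool.AddWorkers(2, 0)
	pool.Push("work item")
	cancel()    // cancel the workers' context; each worker drains its in-flight batch
	pool.Wait() // block until the worker count reaches zero
}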
// addWorkers adds workers to the pool
func (p *WorkerPool) addWorkers(ctx context.Context, cancel context.CancelFunc, number int) {
	for i := 0; i < number; i++ {
		p.lock.Lock()
		if p.cond == nil {
			p.cond = sync.NewCond(&p.lock)
		}
		p.numberOfWorkers++
		p.lock.Unlock()
		go func() {
			pprof.SetGoroutineLabels(ctx)
			p.doWork(ctx)

			p.lock.Lock()
			p.numberOfWorkers--
			if p.numberOfWorkers == 0 {
				p.cond.Broadcast()
				cancel()
			} else if p.numberOfWorkers < 0 {
				// numberOfWorkers can't go negative but...
				log.Warn("Number of Workers < 0 for QID %d - this shouldn't happen", p.qid)
				p.numberOfWorkers = 0
				p.cond.Broadcast()
				cancel()
			}
			select {
			case <-p.baseCtx.Done():
				// Don't warn or check for ongoing work if the baseCtx is shutdown
			case <-p.paused:
				// Don't warn or check for ongoing work if the pool is paused
			default:
				if p.hasNoWorkerScaling() {
					log.Warn(
						"Queue: %d is configured to be non-scaling and has no workers - this configuration is likely incorrect.\n"+
							"The queue will be paused to prevent data-loss with the assumption that you will add workers and unpause as required.", p.qid)
					p.pause()
				} else if p.numberOfWorkers == 0 && atomic.LoadInt64(&p.numInQueue) > 0 {
					// OK there are no workers but... there's still work to be done -> Reboost
					p.zeroBoost()
					// p.lock will be unlocked by zeroBoost
					return
				}
			}
			p.lock.Unlock()
		}()
	}
}

// Wait blocks until the WorkerPool's workers have finished
func (p *WorkerPool) Wait() {
	p.lock.Lock()
	defer p.lock.Unlock()
	if p.cond == nil {
		p.cond = sync.NewCond(&p.lock)
	}
	if p.numberOfWorkers <= 0 {
		return
	}
	p.cond.Wait()
}

// IsPaused returns true if the pool is paused
func (p *WorkerPool) IsPaused() bool {
	p.lock.Lock()
	defer p.lock.Unlock()
	select {
	case <-p.paused:
		return true
	default:
		return false
	}
}

// IsPausedIsResumed returns a channel that is closed when the pool is paused
// and a channel that is closed when it is resumed
func (p *WorkerPool) IsPausedIsResumed() (<-chan struct{}, <-chan struct{}) {
	p.lock.Lock()
	defer p.lock.Unlock()
	return p.paused, p.resumed
}

// Pause pauses the WorkerPool
func (p *WorkerPool) Pause() {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.pause()
}

func (p *WorkerPool) pause() {
	select {
	case <-p.paused:
	default:
		p.resumed = make(chan struct{})
		close(p.paused)
	}
}
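// Illustrative sketch (not part of the original file): pausing and resuming.
// While paused, Push bypasses the boosting logic and writes straight to
// dataChan; Resume may call zeroBoost if work is queued but no workers exist.
func examplePauseResume(pool *WorkerPool) {
	pool.Pause()
	if pool.IsPaused() {
		pool.Push("queued while paused") // blocks only if dataChan is full
	}
	pool.Resume()
}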
// Resume resumes the WorkerPool
func (p *WorkerPool) Resume() {
	p.lock.Lock() // can't defer unlock because of the zeroBoost at the end
	select {
	case <-p.resumed:
		// already resumed - there's nothing to do
		p.lock.Unlock()
		return
	default:
	}

	p.paused = make(chan struct{})
	close(p.resumed)

	// OK now we need to check if we need to add some workers...
	if p.numberOfWorkers > 0 || p.hasNoWorkerScaling() || atomic.LoadInt64(&p.numInQueue) == 0 {
		// We either have workers, can't scale, or there's no work to be done -> so just resume
		p.lock.Unlock()
		return
	}

	// OK we have some work but no workers - we need to think about boosting
	select {
	case <-p.baseCtx.Done():
		// don't bother boosting if the baseCtx is done
		p.lock.Unlock()
		return
	default:
	}

	// OK we'd better add some boost workers!
	p.zeroBoost()
	// p.zeroBoost will unlock the lock
}

// CleanUp will drain the remaining contents of the channel.
// This should be called after the AddWorkers context is closed.
func (p *WorkerPool) CleanUp(ctx context.Context) {
	log.Trace("WorkerPool: %d CleanUp", p.qid)
	close(p.dataChan)
	for data := range p.dataChan {
		if unhandled := p.handle(data); unhandled != nil {
			log.Error("Unhandled Data in clean-up of queue %d", p.qid)
		}

		atomic.AddInt64(&p.numInQueue, -1)
		select {
		case <-ctx.Done():
			log.Warn("WorkerPool: %d Cleanup context closed before finishing clean-up", p.qid)
			return
		default:
		}
	}
	log.Trace("WorkerPool: %d CleanUp Done", p.qid)
}

// Flush flushes the channel with a timeout - the Flush worker will be registered as a flush worker with the manager
func (p *WorkerPool) Flush(timeout time.Duration) error {
	ctx, cancel := p.commonRegisterWorkers(1, timeout, true)
	defer cancel()
	return p.FlushWithContext(ctx)
}

// IsEmpty returns true if the worker queue is empty
func (p *WorkerPool) IsEmpty() bool {
	return atomic.LoadInt64(&p.numInQueue) == 0
}

// FlushWithContext is very similar to CleanUp but it will return as soon as the dataChan is empty.
// NB: The worker will not be registered with the manager.
func (p *WorkerPool) FlushWithContext(ctx context.Context) error {
	log.Trace("WorkerPool: %d Flush", p.qid)
	for {
		select {
		case data := <-p.dataChan:
			if unhandled := p.handle(data); unhandled != nil {
				log.Error("Unhandled Data whilst flushing queue %d", p.qid)
			}
			atomic.AddInt64(&p.numInQueue, -1)
		case <-p.baseCtx.Done():
			return p.baseCtx.Err()
		case <-ctx.Done():
			return ctx.Err()
		default:
			return nil
		}
	}
}
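// Illustrative sketch (not part of the original file): a plausible shutdown
// sequence. Flush registers a temporary flush worker with the manager and
// drains dataChan; CleanUp closes dataChan, so it must come last.
func exampleShutdown(pool *WorkerPool) {
	if err := pool.Flush(30 * time.Second); err != nil {
		log.Error("Flush of queue %d failed: %v", pool.qid, err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	pool.CleanUp(ctx)
}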
func (p *WorkerPool) doWork(ctx context.Context) {
	pprof.SetGoroutineLabels(ctx)
	delay := time.Millisecond * 300

	// Create a common timer - we will use this elsewhere
	timer := time.NewTimer(0)
	util.StopTimer(timer)

	paused, _ := p.IsPausedIsResumed()
	data := make([]Data, 0, p.batchLength)
	for {
		select {
		case <-paused:
			log.Trace("Worker for Queue %d Pausing", p.qid)
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				// Reset the batch so it is not handled a second time after resuming
				data = make([]Data, 0, p.batchLength)
			}
			_, resumed := p.IsPausedIsResumed()
			select {
			case <-resumed:
				paused, _ = p.IsPausedIsResumed()
				log.Trace("Worker for Queue %d Resuming", p.qid)
				util.StopTimer(timer)
			case <-ctx.Done():
				log.Trace("Worker shutting down")
				return
			}
		default:
		}
		select {
		case <-paused:
			// go back around
		case <-ctx.Done():
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
			}
			log.Trace("Worker shutting down")
			return
		case datum, ok := <-p.dataChan:
			if !ok {
				// the dataChan has been closed - we should finish up:
				if len(data) > 0 {
					log.Trace("Handling: %d data, %v", len(data), data)
					if unhandled := p.handle(data...); unhandled != nil {
						log.Error("Unhandled Data in queue %d", p.qid)
					}
					atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				}
				log.Trace("Worker shutting down")
				return
			}
			data = append(data, datum)
			util.StopTimer(timer)

			if len(data) >= p.batchLength {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				data = make([]Data, 0, p.batchLength)
			} else {
				timer.Reset(delay)
			}
		case <-timer.C:
			delay = time.Millisecond * 100
			if len(data) > 0 {
				log.Trace("Handling: %d data, %v", len(data), data)
				if unhandled := p.handle(data...); unhandled != nil {
					log.Error("Unhandled Data in queue %d", p.qid)
				}
				atomic.AddInt64(&p.numInQueue, -1*int64(len(data)))
				data = make([]Data, 0, p.batchLength)
			}
		}
	}
}
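// Illustrative sketch (not part of the original file): blocking on the
// paused/resumed channel pair from outside the pool, mirroring the pattern
// doWork uses above. IsPausedIsResumed must be re-called after each state
// change because pause and Resume replace the channels.
func exampleAwaitResume(ctx context.Context, pool *WorkerPool) {
	paused, resumed := pool.IsPausedIsResumed()
	select {
	case <-paused:
		// Paused: wait until resumed or the context is done.
		select {
		case <-resumed:
		case <-ctx.Done():
		}
	default:
		// Not paused: resumed is already closed and work can proceed.
	}
}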