code.gitea.io/gitea@v1.19.3/modules/queue/workerpool.go (about) 1 // Copyright 2019 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package queue 5 6 import ( 7 "context" 8 "fmt" 9 "runtime/pprof" 10 "sync" 11 "sync/atomic" 12 "time" 13 14 "code.gitea.io/gitea/modules/log" 15 "code.gitea.io/gitea/modules/process" 16 "code.gitea.io/gitea/modules/util" 17 ) 18 19 // WorkerPool represent a dynamically growable worker pool for a 20 // provided handler function. They have an internal channel which 21 // they use to detect if there is a block and will grow and shrink in 22 // response to demand as per configuration. 23 type WorkerPool struct { 24 // This field requires to be the first one in the struct. 25 // This is to allow 64 bit atomic operations on 32-bit machines. 26 // See: https://pkg.go.dev/sync/atomic#pkg-note-BUG & Gitea issue 19518 27 numInQueue int64 28 lock sync.Mutex 29 baseCtx context.Context 30 baseCtxCancel context.CancelFunc 31 baseCtxFinished process.FinishedFunc 32 paused chan struct{} 33 resumed chan struct{} 34 cond *sync.Cond 35 qid int64 36 maxNumberOfWorkers int 37 numberOfWorkers int 38 batchLength int 39 handle HandlerFunc 40 dataChan chan Data 41 blockTimeout time.Duration 42 boostTimeout time.Duration 43 boostWorkers int 44 } 45 46 var ( 47 _ Flushable = &WorkerPool{} 48 _ ManagedPool = &WorkerPool{} 49 ) 50 51 // WorkerPoolConfiguration is the basic configuration for a WorkerPool 52 type WorkerPoolConfiguration struct { 53 Name string 54 QueueLength int 55 BatchLength int 56 BlockTimeout time.Duration 57 BoostTimeout time.Duration 58 BoostWorkers int 59 MaxWorkers int 60 } 61 62 // NewWorkerPool creates a new worker pool 63 func NewWorkerPool(handle HandlerFunc, config WorkerPoolConfiguration) *WorkerPool { 64 ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), fmt.Sprintf("Queue: %s", config.Name), process.SystemProcessType, false) 65 66 dataChan := make(chan Data, config.QueueLength) 67 pool := &WorkerPool{ 68 baseCtx: ctx, 69 baseCtxCancel: cancel, 70 baseCtxFinished: finished, 71 batchLength: config.BatchLength, 72 dataChan: dataChan, 73 resumed: closedChan, 74 paused: make(chan struct{}), 75 handle: handle, 76 blockTimeout: config.BlockTimeout, 77 boostTimeout: config.BoostTimeout, 78 boostWorkers: config.BoostWorkers, 79 maxNumberOfWorkers: config.MaxWorkers, 80 } 81 82 return pool 83 } 84 85 // Done returns when this worker pool's base context has been cancelled 86 func (p *WorkerPool) Done() <-chan struct{} { 87 return p.baseCtx.Done() 88 } 89 90 // Push pushes the data to the internal channel 91 func (p *WorkerPool) Push(data Data) { 92 atomic.AddInt64(&p.numInQueue, 1) 93 p.lock.Lock() 94 select { 95 case <-p.paused: 96 p.lock.Unlock() 97 p.dataChan <- data 98 return 99 default: 100 } 101 102 if p.blockTimeout > 0 && p.boostTimeout > 0 && (p.numberOfWorkers <= p.maxNumberOfWorkers || p.maxNumberOfWorkers < 0) { 103 if p.numberOfWorkers == 0 { 104 p.zeroBoost() 105 } else { 106 p.lock.Unlock() 107 } 108 p.pushBoost(data) 109 } else { 110 p.lock.Unlock() 111 p.dataChan <- data 112 } 113 } 114 115 // HasNoWorkerScaling will return true if the queue has no workers, and has no worker boosting 116 func (p *WorkerPool) HasNoWorkerScaling() bool { 117 p.lock.Lock() 118 defer p.lock.Unlock() 119 return p.hasNoWorkerScaling() 120 } 121 122 func (p *WorkerPool) hasNoWorkerScaling() bool { 123 return p.numberOfWorkers == 0 && (p.boostTimeout == 0 || p.boostWorkers == 0 || p.maxNumberOfWorkers == 0) 124 } 125 126 // zeroBoost will add a temporary boost worker for a no worker queue 127 // p.lock must be locked at the start of this function BUT it will be unlocked by the end of this function 128 // (This is because addWorkers has to be called whilst unlocked) 129 func (p *WorkerPool) zeroBoost() { 130 ctx, cancel := context.WithTimeout(p.baseCtx, p.boostTimeout) 131 mq := GetManager().GetManagedQueue(p.qid) 132 boost := p.boostWorkers 133 if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 { 134 boost = p.maxNumberOfWorkers - p.numberOfWorkers 135 } 136 if mq != nil { 137 log.Debug("WorkerPool: %d (for %s) has zero workers - adding %d temporary workers for %s", p.qid, mq.Name, boost, p.boostTimeout) 138 139 start := time.Now() 140 pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), cancel, false) 141 cancel = func() { 142 mq.RemoveWorkers(pid) 143 } 144 } else { 145 log.Debug("WorkerPool: %d has zero workers - adding %d temporary workers for %s", p.qid, p.boostWorkers, p.boostTimeout) 146 } 147 p.lock.Unlock() 148 p.addWorkers(ctx, cancel, boost) 149 } 150 151 func (p *WorkerPool) pushBoost(data Data) { 152 select { 153 case p.dataChan <- data: 154 default: 155 p.lock.Lock() 156 if p.blockTimeout <= 0 { 157 p.lock.Unlock() 158 p.dataChan <- data 159 return 160 } 161 ourTimeout := p.blockTimeout 162 timer := time.NewTimer(p.blockTimeout) 163 p.lock.Unlock() 164 select { 165 case p.dataChan <- data: 166 util.StopTimer(timer) 167 case <-timer.C: 168 p.lock.Lock() 169 if p.blockTimeout > ourTimeout || (p.numberOfWorkers > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0) { 170 p.lock.Unlock() 171 p.dataChan <- data 172 return 173 } 174 p.blockTimeout *= 2 175 boostCtx, boostCtxCancel := context.WithCancel(p.baseCtx) 176 mq := GetManager().GetManagedQueue(p.qid) 177 boost := p.boostWorkers 178 if (boost+p.numberOfWorkers) > p.maxNumberOfWorkers && p.maxNumberOfWorkers >= 0 { 179 boost = p.maxNumberOfWorkers - p.numberOfWorkers 180 } 181 if mq != nil { 182 log.Debug("WorkerPool: %d (for %s) Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, mq.Name, ourTimeout, boost, p.boostTimeout, p.blockTimeout) 183 184 start := time.Now() 185 pid := mq.RegisterWorkers(boost, start, true, start.Add(p.boostTimeout), boostCtxCancel, false) 186 go func() { 187 <-boostCtx.Done() 188 mq.RemoveWorkers(pid) 189 boostCtxCancel() 190 }() 191 } else { 192 log.Debug("WorkerPool: %d Channel blocked for %v - adding %d temporary workers for %s, block timeout now %v", p.qid, ourTimeout, p.boostWorkers, p.boostTimeout, p.blockTimeout) 193 } 194 go func() { 195 <-time.After(p.boostTimeout) 196 boostCtxCancel() 197 p.lock.Lock() 198 p.blockTimeout /= 2 199 p.lock.Unlock() 200 }() 201 p.lock.Unlock() 202 p.addWorkers(boostCtx, boostCtxCancel, boost) 203 p.dataChan <- data 204 } 205 } 206 } 207 208 // NumberOfWorkers returns the number of current workers in the pool 209 func (p *WorkerPool) NumberOfWorkers() int { 210 p.lock.Lock() 211 defer p.lock.Unlock() 212 return p.numberOfWorkers 213 } 214 215 // NumberInQueue returns the number of items in the queue 216 func (p *WorkerPool) NumberInQueue() int64 { 217 return atomic.LoadInt64(&p.numInQueue) 218 } 219 220 // MaxNumberOfWorkers returns the maximum number of workers automatically added to the pool 221 func (p *WorkerPool) MaxNumberOfWorkers() int { 222 p.lock.Lock() 223 defer p.lock.Unlock() 224 return p.maxNumberOfWorkers 225 } 226 227 // BoostWorkers returns the number of workers for a boost 228 func (p *WorkerPool) BoostWorkers() int { 229 p.lock.Lock() 230 defer p.lock.Unlock() 231 return p.boostWorkers 232 } 233 234 // BoostTimeout returns the timeout of the next boost 235 func (p *WorkerPool) BoostTimeout() time.Duration { 236 p.lock.Lock() 237 defer p.lock.Unlock() 238 return p.boostTimeout 239 } 240 241 // BlockTimeout returns the timeout til the next boost 242 func (p *WorkerPool) BlockTimeout() time.Duration { 243 p.lock.Lock() 244 defer p.lock.Unlock() 245 return p.blockTimeout 246 } 247 248 // SetPoolSettings sets the setable boost values 249 func (p *WorkerPool) SetPoolSettings(maxNumberOfWorkers, boostWorkers int, timeout time.Duration) { 250 p.lock.Lock() 251 defer p.lock.Unlock() 252 p.maxNumberOfWorkers = maxNumberOfWorkers 253 p.boostWorkers = boostWorkers 254 p.boostTimeout = timeout 255 } 256 257 // SetMaxNumberOfWorkers sets the maximum number of workers automatically added to the pool 258 // Changing this number will not change the number of current workers but will change the limit 259 // for future additions 260 func (p *WorkerPool) SetMaxNumberOfWorkers(newMax int) { 261 p.lock.Lock() 262 defer p.lock.Unlock() 263 p.maxNumberOfWorkers = newMax 264 } 265 266 func (p *WorkerPool) commonRegisterWorkers(number int, timeout time.Duration, isFlusher bool) (context.Context, context.CancelFunc) { 267 var ctx context.Context 268 var cancel context.CancelFunc 269 start := time.Now() 270 end := start 271 hasTimeout := false 272 if timeout > 0 { 273 ctx, cancel = context.WithTimeout(p.baseCtx, timeout) 274 end = start.Add(timeout) 275 hasTimeout = true 276 } else { 277 ctx, cancel = context.WithCancel(p.baseCtx) 278 } 279 280 mq := GetManager().GetManagedQueue(p.qid) 281 if mq != nil { 282 pid := mq.RegisterWorkers(number, start, hasTimeout, end, cancel, isFlusher) 283 log.Trace("WorkerPool: %d (for %s) adding %d workers with group id: %d", p.qid, mq.Name, number, pid) 284 return ctx, func() { 285 mq.RemoveWorkers(pid) 286 } 287 } 288 log.Trace("WorkerPool: %d adding %d workers (no group id)", p.qid, number) 289 290 return ctx, cancel 291 } 292 293 // AddWorkers adds workers to the pool - this allows the number of workers to go above the limit 294 func (p *WorkerPool) AddWorkers(number int, timeout time.Duration) context.CancelFunc { 295 ctx, cancel := p.commonRegisterWorkers(number, timeout, false) 296 p.addWorkers(ctx, cancel, number) 297 return cancel 298 } 299 300 // addWorkers adds workers to the pool 301 func (p *WorkerPool) addWorkers(ctx context.Context, cancel context.CancelFunc, number int) { 302 for i := 0; i < number; i++ { 303 p.lock.Lock() 304 if p.cond == nil { 305 p.cond = sync.NewCond(&p.lock) 306 } 307 p.numberOfWorkers++ 308 p.lock.Unlock() 309 go func() { 310 pprof.SetGoroutineLabels(ctx) 311 p.doWork(ctx) 312 313 p.lock.Lock() 314 p.numberOfWorkers-- 315 if p.numberOfWorkers == 0 { 316 p.cond.Broadcast() 317 cancel() 318 } else if p.numberOfWorkers < 0 { 319 // numberOfWorkers can't go negative but... 320 log.Warn("Number of Workers < 0 for QID %d - this shouldn't happen", p.qid) 321 p.numberOfWorkers = 0 322 p.cond.Broadcast() 323 cancel() 324 } 325 select { 326 case <-p.baseCtx.Done(): 327 // Don't warn or check for ongoing work if the baseCtx is shutdown 328 case <-p.paused: 329 // Don't warn or check for ongoing work if the pool is paused 330 default: 331 if p.hasNoWorkerScaling() { 332 log.Warn( 333 "Queue: %d is configured to be non-scaling and has no workers - this configuration is likely incorrect.\n"+ 334 "The queue will be paused to prevent data-loss with the assumption that you will add workers and unpause as required.", p.qid) 335 p.pause() 336 } else if p.numberOfWorkers == 0 && atomic.LoadInt64(&p.numInQueue) > 0 { 337 // OK there are no workers but... there's still work to be done -> Reboost 338 p.zeroBoost() 339 // p.lock will be unlocked by zeroBoost 340 return 341 } 342 } 343 p.lock.Unlock() 344 }() 345 } 346 } 347 348 // Wait for WorkerPool to finish 349 func (p *WorkerPool) Wait() { 350 p.lock.Lock() 351 defer p.lock.Unlock() 352 if p.cond == nil { 353 p.cond = sync.NewCond(&p.lock) 354 } 355 if p.numberOfWorkers <= 0 { 356 return 357 } 358 p.cond.Wait() 359 } 360 361 // IsPaused returns if the pool is paused 362 func (p *WorkerPool) IsPaused() bool { 363 p.lock.Lock() 364 defer p.lock.Unlock() 365 select { 366 case <-p.paused: 367 return true 368 default: 369 return false 370 } 371 } 372 373 // IsPausedIsResumed returns if the pool is paused and a channel that is closed when it is resumed 374 func (p *WorkerPool) IsPausedIsResumed() (<-chan struct{}, <-chan struct{}) { 375 p.lock.Lock() 376 defer p.lock.Unlock() 377 return p.paused, p.resumed 378 } 379 380 // Pause pauses the WorkerPool 381 func (p *WorkerPool) Pause() { 382 p.lock.Lock() 383 defer p.lock.Unlock() 384 p.pause() 385 } 386 387 func (p *WorkerPool) pause() { 388 select { 389 case <-p.paused: 390 default: 391 p.resumed = make(chan struct{}) 392 close(p.paused) 393 } 394 } 395 396 // Resume resumes the WorkerPool 397 func (p *WorkerPool) Resume() { 398 p.lock.Lock() // can't defer unlock because of the zeroBoost at the end 399 select { 400 case <-p.resumed: 401 // already resumed - there's nothing to do 402 p.lock.Unlock() 403 return 404 default: 405 } 406 407 p.paused = make(chan struct{}) 408 close(p.resumed) 409 410 // OK now we need to check if we need to add some workers... 411 if p.numberOfWorkers > 0 || p.hasNoWorkerScaling() || atomic.LoadInt64(&p.numInQueue) == 0 { 412 // We either have workers, can't scale or there's no work to be done -> so just resume 413 p.lock.Unlock() 414 return 415 } 416 417 // OK we got some work but no workers we need to think about boosting 418 select { 419 case <-p.baseCtx.Done(): 420 // don't bother boosting if the baseCtx is done 421 p.lock.Unlock() 422 return 423 default: 424 } 425 426 // OK we'd better add some boost workers! 427 p.zeroBoost() 428 // p.zeroBoost will unlock the lock 429 } 430 431 // CleanUp will drain the remaining contents of the channel 432 // This should be called after AddWorkers context is closed 433 func (p *WorkerPool) CleanUp(ctx context.Context) { 434 log.Trace("WorkerPool: %d CleanUp", p.qid) 435 close(p.dataChan) 436 for data := range p.dataChan { 437 if unhandled := p.handle(data); unhandled != nil { 438 if unhandled != nil { 439 log.Error("Unhandled Data in clean-up of queue %d", p.qid) 440 } 441 } 442 443 atomic.AddInt64(&p.numInQueue, -1) 444 select { 445 case <-ctx.Done(): 446 log.Warn("WorkerPool: %d Cleanup context closed before finishing clean-up", p.qid) 447 return 448 default: 449 } 450 } 451 log.Trace("WorkerPool: %d CleanUp Done", p.qid) 452 } 453 454 // Flush flushes the channel with a timeout - the Flush worker will be registered as a flush worker with the manager 455 func (p *WorkerPool) Flush(timeout time.Duration) error { 456 ctx, cancel := p.commonRegisterWorkers(1, timeout, true) 457 defer cancel() 458 return p.FlushWithContext(ctx) 459 } 460 461 // IsEmpty returns if true if the worker queue is empty 462 func (p *WorkerPool) IsEmpty() bool { 463 return atomic.LoadInt64(&p.numInQueue) == 0 464 } 465 466 // contextError returns either ctx.Done(), the base context's error or nil 467 func (p *WorkerPool) contextError(ctx context.Context) error { 468 select { 469 case <-p.baseCtx.Done(): 470 return p.baseCtx.Err() 471 case <-ctx.Done(): 472 return ctx.Err() 473 default: 474 return nil 475 } 476 } 477 478 // FlushWithContext is very similar to CleanUp but it will return as soon as the dataChan is empty 479 // NB: The worker will not be registered with the manager. 480 func (p *WorkerPool) FlushWithContext(ctx context.Context) error { 481 log.Trace("WorkerPool: %d Flush", p.qid) 482 paused, _ := p.IsPausedIsResumed() 483 for { 484 // Because select will return any case that is satisified at random we precheck here before looking at dataChan. 485 select { 486 case <-paused: 487 // Ensure that even if paused that the cancelled error is still sent 488 return p.contextError(ctx) 489 case <-p.baseCtx.Done(): 490 return p.baseCtx.Err() 491 case <-ctx.Done(): 492 return ctx.Err() 493 default: 494 } 495 496 select { 497 case <-paused: 498 return p.contextError(ctx) 499 case data, ok := <-p.dataChan: 500 if !ok { 501 return nil 502 } 503 if unhandled := p.handle(data); unhandled != nil { 504 log.Error("Unhandled Data whilst flushing queue %d", p.qid) 505 } 506 atomic.AddInt64(&p.numInQueue, -1) 507 case <-p.baseCtx.Done(): 508 return p.baseCtx.Err() 509 case <-ctx.Done(): 510 return ctx.Err() 511 default: 512 return nil 513 } 514 } 515 } 516 517 func (p *WorkerPool) doWork(ctx context.Context) { 518 pprof.SetGoroutineLabels(ctx) 519 delay := time.Millisecond * 300 520 521 // Create a common timer - we will use this elsewhere 522 timer := time.NewTimer(0) 523 util.StopTimer(timer) 524 525 paused, _ := p.IsPausedIsResumed() 526 data := make([]Data, 0, p.batchLength) 527 for { 528 // Because select will return any case that is satisified at random we precheck here before looking at dataChan. 529 select { 530 case <-paused: 531 log.Trace("Worker for Queue %d Pausing", p.qid) 532 if len(data) > 0 { 533 log.Trace("Handling: %d data, %v", len(data), data) 534 if unhandled := p.handle(data...); unhandled != nil { 535 log.Error("Unhandled Data in queue %d", p.qid) 536 } 537 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 538 } 539 _, resumed := p.IsPausedIsResumed() 540 select { 541 case <-resumed: 542 paused, _ = p.IsPausedIsResumed() 543 log.Trace("Worker for Queue %d Resuming", p.qid) 544 util.StopTimer(timer) 545 case <-ctx.Done(): 546 log.Trace("Worker shutting down") 547 return 548 } 549 case <-ctx.Done(): 550 if len(data) > 0 { 551 log.Trace("Handling: %d data, %v", len(data), data) 552 if unhandled := p.handle(data...); unhandled != nil { 553 log.Error("Unhandled Data in queue %d", p.qid) 554 } 555 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 556 } 557 log.Trace("Worker shutting down") 558 return 559 default: 560 } 561 562 select { 563 case <-paused: 564 // go back around 565 case <-ctx.Done(): 566 if len(data) > 0 { 567 log.Trace("Handling: %d data, %v", len(data), data) 568 if unhandled := p.handle(data...); unhandled != nil { 569 log.Error("Unhandled Data in queue %d", p.qid) 570 } 571 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 572 } 573 log.Trace("Worker shutting down") 574 return 575 case datum, ok := <-p.dataChan: 576 if !ok { 577 // the dataChan has been closed - we should finish up: 578 if len(data) > 0 { 579 log.Trace("Handling: %d data, %v", len(data), data) 580 if unhandled := p.handle(data...); unhandled != nil { 581 log.Error("Unhandled Data in queue %d", p.qid) 582 } 583 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 584 } 585 log.Trace("Worker shutting down") 586 return 587 } 588 data = append(data, datum) 589 util.StopTimer(timer) 590 591 if len(data) >= p.batchLength { 592 log.Trace("Handling: %d data, %v", len(data), data) 593 if unhandled := p.handle(data...); unhandled != nil { 594 log.Error("Unhandled Data in queue %d", p.qid) 595 } 596 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 597 data = make([]Data, 0, p.batchLength) 598 } else { 599 timer.Reset(delay) 600 } 601 case <-timer.C: 602 delay = time.Millisecond * 100 603 if len(data) > 0 { 604 log.Trace("Handling: %d data, %v", len(data), data) 605 if unhandled := p.handle(data...); unhandled != nil { 606 log.Error("Unhandled Data in queue %d", p.qid) 607 } 608 atomic.AddInt64(&p.numInQueue, -1*int64(len(data))) 609 data = make([]Data, 0, p.batchLength) 610 } 611 } 612 } 613 }