code.gitea.io/gitea@v1.22.3/modules/queue/workergroup.go

// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package queue

import (
	"context"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"code.gitea.io/gitea/modules/log"
)

var (
	infiniteTimerC         = make(chan time.Time)
	batchDebounceDuration  = 100 * time.Millisecond
	workerIdleDuration     = 1 * time.Second
	shutdownDefaultTimeout = 2 * time.Second

	unhandledItemRequeueDuration atomic.Int64 // to avoid data race during test
)

func init() {
	unhandledItemRequeueDuration.Store(int64(5 * time.Second))
}

// workerGroup is a group of workers to work with a WorkerPoolQueue
type workerGroup[T any] struct {
	q  *WorkerPoolQueue[T]
	wg sync.WaitGroup

	ctxWorker       context.Context
	ctxWorkerCancel context.CancelFunc

	batchBuffer []T
	popItemChan chan []byte
	popItemErr  chan error
}

func (wg *workerGroup[T]) doPrepareWorkerContext() {
	wg.ctxWorker, wg.ctxWorkerCancel = context.WithCancel(wg.q.ctxRun)
}

// doDispatchBatchToWorker dispatches a batch of items to the workers' channel.
// If the channel is full, it tries to start a new worker if possible.
func (q *WorkerPoolQueue[T]) doDispatchBatchToWorker(wg *workerGroup[T], flushChan chan flushType) {
	batch := wg.batchBuffer
	wg.batchBuffer = nil

	if len(batch) == 0 {
		return
	}

	full := false
	select {
	case q.batchChan <- batch:
	default:
		full = true
	}

	// TODO: the logic could be improved in the future, to avoid a data-race between "doStartNewWorker" and "workerNum".
	// The root problem is that if we skip "doStartNewWorker" here, the "workerNum" might be decreased by other workers later,
	// so ideally it should check whether there are enough workers by some approach, and start new workers if necessary.
	// This data-race is not serious: as long as a new worker is started soon there will be enough workers,
	// so there is no need for a huge refactor at the moment.
	q.workerNumMu.Lock()
	noWorker := q.workerNum == 0
	if full || noWorker {
		if q.workerNum < q.workerMaxNum || (noWorker && q.workerMaxNum <= 0) {
			q.workerNum++
			q.doStartNewWorker(wg)
		}
	}
	q.workerNumMu.Unlock()

	if full {
		select {
		case q.batchChan <- batch:
		case flush := <-flushChan:
			q.doWorkerHandle(batch)
			q.doFlush(wg, flush)
		case <-q.ctxRun.Done():
			wg.batchBuffer = batch // return the batch to the buffer, the "doRun" function will handle it
		}
	}
}
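// Aside: the dispatch above uses a common Go idiom, attempting a non-blocking
// send first and falling back to a slow path (start a worker, then block) only
// when the channel is full. A minimal standalone sketch of that idiom follows;
// "exampleTrySend" is a hypothetical name for illustration only, not part of
// the original file:
func exampleTrySend[T any](ch chan T, v T) bool {
	select {
	case ch <- v: // a receiver was ready, or the buffer had room
		return true
	default: // channel full: the caller may start another consumer, then retry with a blocking send
		return false
	}
}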
// doWorkerHandle calls the safeHandler to handle a batch of items, and it increases/decreases the active worker number.
// If the context has been canceled, it should not be called, because the "Push" still needs the context; in that case, call q.safeHandler directly.
func (q *WorkerPoolQueue[T]) doWorkerHandle(batch []T) {
	q.workerNumMu.Lock()
	q.workerActiveNum++
	q.workerNumMu.Unlock()

	defer func() {
		q.workerNumMu.Lock()
		q.workerActiveNum--
		q.workerNumMu.Unlock()
	}()

	unhandled := q.safeHandler(batch...)
	// if none of the items were handled, it should back off for a few seconds,
	// because in this case the handler (e.g. the document indexer) may have encountered some errors/failures
	if len(unhandled) == len(batch) && unhandledItemRequeueDuration.Load() != 0 {
		log.Error("Queue %q failed to handle batch of %d items, backoff for a few seconds", q.GetName(), len(batch))
		select {
		case <-q.ctxRun.Done():
		case <-time.After(time.Duration(unhandledItemRequeueDuration.Load())):
		}
	}
	for _, item := range unhandled {
		if err := q.Push(item); err != nil {
			if !q.basePushForShutdown(item) {
				log.Error("Failed to requeue item for queue %q when calling handler: %v", q.GetName(), err)
			}
		}
	}
}

// basePushForShutdown tries to requeue items into the base queue when the WorkerPoolQueue is shutting down.
// If the queue is shutting down, it returns true and tries to push the items;
// otherwise it does nothing and returns false.
func (q *WorkerPoolQueue[T]) basePushForShutdown(items ...T) bool {
	shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
	if shutdownTimeout == 0 {
		return false
	}
	ctxShutdown, ctxShutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout)
	defer ctxShutdownCancel()
	for _, item := range items {
		// if there is still an error, the queue can do nothing but lose the items
		if err := q.baseQueue.PushItem(ctxShutdown, q.marshal(item)); err != nil {
			log.Error("Failed to requeue item for queue %q when shutting down: %v", q.GetName(), err)
		}
	}
	return true
}

// resetIdleTicker resets the ticker and drains any tick that has already fired,
// so a stale tick does not make an active worker look idle.
func resetIdleTicker(t *time.Ticker, dur time.Duration) {
	t.Reset(dur)
	select {
	case <-t.C:
	default:
	}
}

// doStartNewWorker starts a new worker for the queue; the worker reads from the worker's channel and handles the items.
func (q *WorkerPoolQueue[T]) doStartNewWorker(wp *workerGroup[T]) {
	wp.wg.Add(1)

	go func() {
		defer wp.wg.Done()

		log.Debug("Queue %q starts new worker", q.GetName())
		defer log.Debug("Queue %q stops idle worker", q.GetName())

		t := time.NewTicker(workerIdleDuration)
		defer t.Stop()

		keepWorking := true
		stopWorking := func() {
			q.workerNumMu.Lock()
			keepWorking = false
			q.workerNum--
			q.workerNumMu.Unlock()
		}
		for keepWorking {
			select {
			case <-wp.ctxWorker.Done():
				stopWorking()
			case batch, ok := <-q.batchChan:
				if !ok {
					stopWorking()
					continue
				}
				q.doWorkerHandle(batch)
				// reset the idle ticker, and drain the tick after reset in case a tick was already triggered
				resetIdleTicker(t, workerIdleDuration) // key code for TestWorkerPoolQueueWorkerIdleReset
			case <-t.C:
				q.workerNumMu.Lock()
				keepWorking = q.workerNum <= 1 // keep the last worker running
				if !keepWorking {
					q.workerNum--
				}
				q.workerNumMu.Unlock()
			}
		}
	}()
}
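// Aside: time.Ticker.Reset does not discard a tick that has already been
// delivered to t.C, which is why resetIdleTicker above drains the channel
// after resetting. A hypothetical sketch of the failure mode it prevents
// ("exampleStaleTick" is illustrative only, not part of the original file):
func exampleStaleTick() {
	t := time.NewTicker(10 * time.Millisecond)
	defer t.Stop()
	time.Sleep(20 * time.Millisecond) // a tick is now buffered in t.C
	t.Reset(time.Hour)                // Reset does not clear the buffered tick
	select {
	case <-t.C: // usually taken immediately: the stale tick is still there
	default:
	}
}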
// doFlush flushes the queue: it tries to read all items from the queue and handles them.
// It is for testing purposes only; it's not designed to work for a cluster.
func (q *WorkerPoolQueue[T]) doFlush(wg *workerGroup[T], flush flushType) {
	log.Debug("Queue %q starts flushing", q.GetName())
	defer log.Debug("Queue %q finishes flushing", q.GetName())

	// stop all workers, and prepare a new worker context to start new workers

	wg.ctxWorkerCancel()
	wg.wg.Wait()

	defer func() {
		close(flush)
		wg.doPrepareWorkerContext()
	}()

	// drain the batch channel first
loop:
	for {
		select {
		case batch := <-q.batchChan:
			q.doWorkerHandle(batch)
		default:
			break loop
		}
	}

	// drain the popItem channel
	emptyCounter := 0
	for {
		select {
		case data, dataOk := <-wg.popItemChan:
			if !dataOk {
				return
			}
			emptyCounter = 0
			if v, jsonOk := q.unmarshal(data); !jsonOk {
				continue
			} else {
				q.doWorkerHandle([]T{v})
			}
		case err := <-wg.popItemErr:
			if !q.isCtxRunCanceled() {
				log.Error("Failed to pop item from queue %q (doFlush): %v", q.GetName(), err)
			}
			return
		case <-q.ctxRun.Done():
			log.Debug("Queue %q is shutting down", q.GetName())
			return
		case <-time.After(20 * time.Millisecond):
			// There is no reliable way to make sure all queue items are consumed by the Flush; there might always be some items stored in buffers or temp variables.
			// If Gitea runs in a cluster, we cannot even guarantee that all items are consumed by a deterministic instance.
			// Luckily, the "Flush" trick is only used in tests, so far so good.
			if cnt, _ := q.baseQueue.Len(q.ctxRun); cnt == 0 && len(wg.popItemChan) == 0 {
				emptyCounter++
			}
			if emptyCounter >= 2 {
				return
			}
		}
	}
}

func (q *WorkerPoolQueue[T]) isCtxRunCanceled() bool {
	select {
	case <-q.ctxRun.Done():
		return true
	default:
		return false
	}
}

var skipFlushChan = make(chan flushType) // an empty flush chan, used to skip reading other flush requests
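// Aside: doRun below debounces batch dispatch with a timer channel that is
// normally infiniteTimerC (never written to, so its case blocks forever) and
// is swapped for a real time.After channel when the first item is buffered.
// A condensed, hypothetical sketch of that pattern (names are illustrative,
// not part of the original file):
func exampleDebounce(items <-chan int, handle func([]int)) {
	var buf []int
	var timerC <-chan time.Time = make(chan time.Time) // "infinite": never fires
	for {
		select {
		case it := <-items:
			buf = append(buf, it)
			if len(buf) == 1 {
				timerC = time.After(100 * time.Millisecond) // arm the debounce timer on the first item
			}
		case <-timerC:
			handle(buf)
			buf = nil
			timerC = make(chan time.Time) // disarm: back to a never-firing channel
		}
	}
}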
// doRun is the main loop of the queue. All related "doXxx" functions are executed in its context.
func (q *WorkerPoolQueue[T]) doRun() {
	pprof.SetGoroutineLabels(q.ctxRun)

	log.Debug("Queue %q starts running", q.GetName())
	defer log.Debug("Queue %q stops running", q.GetName())

	wg := &workerGroup[T]{q: q}
	wg.doPrepareWorkerContext()
	wg.popItemChan, wg.popItemErr = popItemByChan(q.ctxRun, q.baseQueue.PopItem)

	defer func() {
		q.ctxRunCancel()

		// drain all data on the fly:
		// since the queue is shutting down, the items can't be dispatched to workers because the context is canceled,
		// and it can't call doWorkerHandle either, because there is no chance to push unhandled items back to the queue
		var unhandled []T
		close(q.batchChan)
		for batch := range q.batchChan {
			unhandled = append(unhandled, batch...)
		}
		unhandled = append(unhandled, wg.batchBuffer...)
		for data := range wg.popItemChan {
			if v, ok := q.unmarshal(data); ok {
				unhandled = append(unhandled, v)
			}
		}

		shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
		if shutdownTimeout != 0 {
			// if there is a shutdown context, try to push the items back to the base queue
			q.basePushForShutdown(unhandled...)
			workerDone := make(chan struct{})
			// wrapping Wait in a goroutine is the only way to wait for the workers with a timeout, because the handlers do not have a context to wait on
			go func() { wg.wg.Wait(); close(workerDone) }()
			select {
			case <-workerDone:
			case <-time.After(shutdownTimeout):
				log.Error("Queue %q is shutting down, but workers are still running after timeout", q.GetName())
			}
		} else {
			// if there is no shutdown context, just call the handler to try to handle the items; if the handler fails again, the items are lost
			q.safeHandler(unhandled...)
		}

		close(q.shutdownDone)
	}()

	var batchDispatchC <-chan time.Time = infiniteTimerC
	for {
		select {
		case data, dataOk := <-wg.popItemChan:
			if !dataOk {
				return
			}
			if v, jsonOk := q.unmarshal(data); !jsonOk {
				testRecorder.Record("pop:corrupted:%s", data) // in rare cases the levelqueue (leveldb) might be corrupted
				continue
			} else {
				wg.batchBuffer = append(wg.batchBuffer, v)
			}
			if len(wg.batchBuffer) >= q.batchLength {
				q.doDispatchBatchToWorker(wg, q.flushChan)
			} else if batchDispatchC == infiniteTimerC {
				batchDispatchC = time.After(batchDebounceDuration)
			} // else: batchDispatchC is already a debounce timer, it will be triggered soon
		case <-batchDispatchC:
			batchDispatchC = infiniteTimerC
			q.doDispatchBatchToWorker(wg, q.flushChan)
		case flush := <-q.flushChan:
			// before flushing, it needs to try to dispatch the batch to a worker first, in case there is no worker running;
			// after that dispatch, there is at least one worker running, so "doFlush" can wait for workers to finish.
			// Since we are already in a "flush" operation, the dispatching function shouldn't read the flush chan.
			q.doDispatchBatchToWorker(wg, skipFlushChan)
			q.doFlush(wg, flush)
		case err := <-wg.popItemErr:
			if !q.isCtxRunCanceled() {
				log.Error("Failed to pop item from queue %q (doRun): %v", q.GetName(), err)
			}
			return
		case <-q.ctxRun.Done():
			log.Debug("Queue %q is shutting down", q.GetName())
			return
		}
	}
}
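// Aside: the shutdown path in doRun waits for workers with a timeout by
// wrapping sync.WaitGroup.Wait in a goroutine, because Wait has no
// context/deadline variant. A standalone sketch of that pattern
// ("exampleWaitTimeout" is a hypothetical name, not part of the original file):
func exampleWaitTimeout(wg *sync.WaitGroup, timeout time.Duration) bool {
	done := make(chan struct{})
	go func() {
		wg.Wait() // blocks until all workers have called Done
		close(done)
	}()
	select {
	case <-done:
		return true // all workers finished in time
	case <-time.After(timeout):
		return false // workers still running; the caller can only log and proceed
	}
}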