// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/queue/queue.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.

package queue

import (
	"context"
	"sync"
	"time"

	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"
)

const (
	// How frequently to check for disconnected queriers that should be forgotten.
	forgetCheckPeriod = 5 * time.Second
)

var (
	// ErrTooManyRequests is returned by EnqueueRequest when the per-tenant queue is full.
	ErrTooManyRequests = errors.New("too many outstanding requests")
	// ErrStopped is returned once the queue's service has been stopped.
	ErrStopped = errors.New("queue is stopped")
)

// UserIndex is opaque type that allows to resume iteration over users between successive calls
// of RequestQueue.GetNextRequestForQuerier method.
type UserIndex struct {
	// last is the index of the user whose queue was most recently returned;
	// -1 means "start from the first user" (see FirstUser).
	last int
}

// Modify index to start iteration on the same user, for which last queue was returned.
// Decrementing by one makes the next getNextQueueForQuerier call land on the same
// user again, since iteration resumes at last+1.
func (ui UserIndex) ReuseLastUser() UserIndex {
	if ui.last >= 0 {
		return UserIndex{last: ui.last - 1}
	}
	return ui
}

// FirstUser returns UserIndex that starts iteration over user queues from the very first user.
func FirstUser() UserIndex {
	return UserIndex{last: -1}
}

// Request stored into the queue.
type Request interface{}

// RequestQueue holds incoming requests in per-user queues. It also assigns each user specified number of queriers,
// and when querier asks for next request to handle (using GetNextRequestForQuerier), it returns requests
// in a fair fashion.
type RequestQueue struct {
	services.Service

	// Number of currently connected querier workers; updated atomically so that
	// GetConnectedQuerierWorkersMetric can read it without taking mtx.
	connectedQuerierWorkers *atomic.Int32

	mtx     sync.Mutex
	cond    contextCond // Notified when request is enqueued or dequeued, or querier is disconnected.
	queues  *queues     // Per-user queue state; all access must hold mtx.
	stopped bool        // Set by stopping(); once true, enqueue/dequeue return ErrStopped.

	queueLength       *prometheus.GaugeVec   // Per user and reason.
	discardedRequests *prometheus.CounterVec // Per user.
}

// NewRequestQueue constructs a RequestQueue. maxOutstandingPerTenant caps each
// per-user queue; forgetDelay controls how long a disconnected querier's
// assignment is kept before being forgotten (checked every forgetCheckPeriod).
func NewRequestQueue(maxOutstandingPerTenant int, forgetDelay time.Duration, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec) *RequestQueue {
	q := &RequestQueue{
		queues:                  newUserQueues(maxOutstandingPerTenant, forgetDelay),
		connectedQuerierWorkers: atomic.NewInt32(0),
		queueLength:             queueLength,
		discardedRequests:       discardedRequests,
	}

	q.cond = contextCond{Cond: sync.NewCond(&q.mtx)}
	// The timer service periodically forgets disconnected queriers; stopping()
	// drains outstanding requests before the queue shuts down.
	q.Service = services.NewTimerService(forgetCheckPeriod, nil, q.forgetDisconnectedQueriers, q.stopping).WithName("request queue")

	return q
}

// EnqueueRequest puts the request into the queue. MaxQueries is user-specific value that specifies how many queriers can
// this user use (zero or negative = all queriers). It is passed to each EnqueueRequest, because it can change
// between calls.
//
// If request is successfully enqueued, successFn is called with the lock held, before any querier can receive the request.
//
// Returns ErrStopped after shutdown, ErrTooManyRequests when the user's queue is full.
func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.stopped {
		return ErrStopped
	}

	queue := q.queues.getOrAddQueue(userID, maxQueriers)
	if queue == nil {
		// This can only happen if userID is "".
		return errors.New("no queue found")
	}

	select {
	case queue <- req:
		q.queueLength.WithLabelValues(userID).Inc()
		q.cond.Broadcast()
		// Call this function while holding a lock. This guarantees that no querier can fetch the request before function returns.
		if successFn != nil {
			successFn()
		}
		return nil
	default:
		// Buffered channel is full: the tenant has hit maxOutstandingPerTenant.
		q.discardedRequests.WithLabelValues(userID).Inc()
		return ErrTooManyRequests
	}
}

// GetNextRequestForQuerier find next user queue and takes the next request off of it. Will block if there are no requests.
// By passing user index from previous call of this method, querier guarantees that it iterates over all users fairly.
// If querier finds that request from the user is already expired, it can get a request for the same user by using UserIndex.ReuseLastUser.
func (q *RequestQueue) GetNextRequestForQuerier(ctx context.Context, last UserIndex, querierID string) (Request, UserIndex, error) {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	querierWait := false

FindQueue:
	// We need to wait if there are no users, or no pending requests for given querier.
	// querierWait is set below after a full scan found no queue assigned to this querier.
	for (q.queues.len() == 0 || querierWait) && ctx.Err() == nil && !q.stopped {
		querierWait = false
		q.cond.Wait(ctx)
	}

	if q.stopped {
		return nil, last, ErrStopped
	}

	if err := ctx.Err(); err != nil {
		return nil, last, err
	}

	for {
		queue, userID, idx := q.queues.getNextQueueForQuerier(last.last, querierID)
		last.last = idx
		if queue == nil {
			// No user queue is assigned to this querier right now.
			break
		}

		// Pick next request from the queue.
		for {
			// NOTE(review): this receive happens while holding q.mtx; it relies on
			// queues returned by getNextQueueForQuerier never being empty (queues
			// are deleted once drained, and enqueue only leaves non-empty queues) —
			// verify against the queues implementation.
			request := <-queue
			if len(queue) == 0 {
				q.queues.deleteQueue(userID)
			}

			q.queueLength.WithLabelValues(userID).Dec()

			// Tell close() we've processed a request.
			q.cond.Broadcast()

			return request, last, nil
		}
	}

	// There are no unexpired requests, so we can get back
	// and wait for more requests.
	querierWait = true
	goto FindQueue
}

// forgetDisconnectedQueriers is the timer-service iteration function: it forgets
// queriers that have been disconnected longer than the forget delay.
func (q *RequestQueue) forgetDisconnectedQueriers(_ context.Context) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.queues.forgetDisconnectedQueriers(time.Now()) > 0 {
		// We need to notify goroutines cause having removed some queriers
		// may have caused a resharding.
		q.cond.Broadcast()
	}

	return nil
}

// stopping is the service shutdown hook. It waits until either all queued
// requests have been dispatched or no querier workers remain connected,
// then marks the queue stopped and wakes all waiters.
func (q *RequestQueue) stopping(_ error) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	for q.queues.len() > 0 && q.connectedQuerierWorkers.Load() > 0 {
		q.cond.Wait(context.Background())
	}

	// Only stop after dispatching enqueued requests.
	q.stopped = true

	// If there are still goroutines in GetNextRequestForQuerier method, they get notified.
	q.cond.Broadcast()

	return nil
}

// RegisterQuerierConnection records a new querier worker connection, which may
// affect per-tenant querier sharding.
func (q *RequestQueue) RegisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Inc()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.addQuerierConnection(querier)
}

// UnregisterQuerierConnection records that a querier worker disconnected; the
// querier is only forgotten later (see forgetDisconnectedQueriers).
func (q *RequestQueue) UnregisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Dec()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.removeQuerierConnection(querier, time.Now())
}

// NotifyQuerierShutdown informs the queue that a querier is shutting down
// gracefully, so its assignments can be handled accordingly.
func (q *RequestQueue) NotifyQuerierShutdown(querierID string) {
	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.notifyQuerierShutdown(querierID)
}

// GetConnectedQuerierWorkersMetric returns the current number of connected
// querier workers, for metrics export. Lock-free read.
func (q *RequestQueue) GetConnectedQuerierWorkersMetric() float64 {
	return float64(q.connectedQuerierWorkers.Load())
}

// contextCond is a *sync.Cond with Wait() method overridden to support context-based waiting.
type contextCond struct {
	*sync.Cond

	// testHookBeforeWaiting is called before calling Cond.Wait() if it's not nil.
	// Yes, it's ugly, but the http package settled jurisprudence:
	// https://github.com/golang/go/blob/6178d25fc0b28724b1b5aec2b1b74fc06d9294c7/src/net/http/client.go#L596-L601
	testHookBeforeWaiting func()
}

// Wait does c.cond.Wait() but will also return if the context provided is done.
// All the documentation of sync.Cond.Wait() applies, but it's especially important to remember that the mutex of
// the cond should be held while Wait() is called (and mutex will be held once it returns)
func (c contextCond) Wait(ctx context.Context) {
	// "condWait" goroutine does q.cond.Wait() and signals through condWait channel.
	condWait := make(chan struct{})
	go func() {
		if c.testHookBeforeWaiting != nil {
			c.testHookBeforeWaiting()
		}
		c.Cond.Wait()
		close(condWait)
	}()

	// "waiting" goroutine: signals that the condWait goroutine has started waiting.
	// Notice that a closed waiting channel implies that the goroutine above has started waiting
	// (because it has unlocked the mutex), but the other way is not true:
	// - condWait it may have unlocked and is waiting, but someone else locked the mutex faster than us:
	//   in this case that caller will eventually unlock, and we'll be able to enter here.
	// - condWait called Wait(), unlocked, received a broadcast and locked again faster than we were able to lock here:
	//   in this case condWait channel will be closed, and this goroutine will be waiting until we unlock.
	waiting := make(chan struct{})
	go func() {
		c.L.Lock()
		close(waiting)
		c.L.Unlock()
	}()

	select {
	case <-condWait:
		// We don't know whether the waiting goroutine is done or not, but we don't care:
		// it will be done once nobody is fighting for the mutex anymore.
	case <-ctx.Done():
		// In order to avoid leaking the condWait goroutine, we can send a broadcast.
		// Before sending the broadcast we need to make sure that condWait goroutine is already waiting (or has already waited).
		select {
		case <-condWait:
			// No need to broadcast as q.cond.Wait() has returned already.
			return
		case <-waiting:
			// q.cond.Wait() might be still waiting (or maybe not!), so we'll poke it just in case.
			c.Broadcast()
		}

		// Make sure we are not waiting anymore, we need to do that before returning as the caller will need to unlock the mutex.
		<-condWait
	}
}