// github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/scheduler/queue/queue.go

package queue

import (
	"context"
	"sync"
	"time"

	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"
)

const (
	// How frequently to check for disconnected queriers that should be forgotten.
	forgetCheckPeriod = 5 * time.Second
)

var (
	ErrTooManyRequests = errors.New("too many outstanding requests")
	ErrStopped         = errors.New("queue is stopped")
)

// UserIndex is an opaque type that allows resuming iteration over users between
// successive calls of the RequestQueue.GetNextRequestForQuerier method.
type UserIndex struct {
	last int
}

// ReuseLastUser modifies the index so that iteration restarts at the same user
// for which the last queue was returned.
func (ui UserIndex) ReuseLastUser() UserIndex {
	if ui.last >= 0 {
		return UserIndex{last: ui.last - 1}
	}
	return ui
}

// FirstUser returns a UserIndex that starts iteration over user queues from the very first user.
func FirstUser() UserIndex {
	return UserIndex{last: -1}
}

// Request is a request stored in the queue.
type Request interface{}

// RequestQueue holds incoming requests in per-user queues. It also assigns each user a specified
// number of queriers, and when a querier asks for the next request to handle (using
// GetNextRequestForQuerier), it returns requests in a fair fashion.
type RequestQueue struct {
	services.Service

	connectedQuerierWorkers *atomic.Int32

	mtx     sync.Mutex
	cond    contextCond // Notified when a request is enqueued or dequeued, or a querier is disconnected.
	queues  *queues
	stopped bool

	queueLength       *prometheus.GaugeVec   // Per user and reason.
	discardedRequests *prometheus.CounterVec // Per user.
}

func NewRequestQueue(maxOutstandingPerTenant int, forgetDelay time.Duration, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec) *RequestQueue {
	q := &RequestQueue{
		queues:                  newUserQueues(maxOutstandingPerTenant, forgetDelay),
		connectedQuerierWorkers: atomic.NewInt32(0),
		queueLength:             queueLength,
		discardedRequests:       discardedRequests,
	}

	q.cond = contextCond{Cond: sync.NewCond(&q.mtx)}
	q.Service = services.NewTimerService(forgetCheckPeriod, nil, q.forgetDisconnectedQueriers, q.stopping).WithName("request queue")

	return q
}

// EnqueueRequest puts the request into the queue. maxQueriers is a per-user value that specifies
// how many queriers this user can use (zero or negative means all queriers). It is passed on each
// EnqueueRequest call because it can change between calls.
//
// If the request is successfully enqueued, successFn is called with the lock held, before any
// querier can receive the request.
func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.stopped {
		return ErrStopped
	}

	queue := q.queues.getOrAddQueue(userID, maxQueriers)
	if queue == nil {
		// This can only happen if userID is "".
		return errors.New("no queue found")
	}

	select {
	case queue <- req:
		q.queueLength.WithLabelValues(userID).Inc()
		q.cond.Broadcast()
		// Call successFn while holding the lock. This guarantees that no querier can fetch the
		// request before the function returns.
		if successFn != nil {
			successFn()
		}
		return nil
	default:
		q.discardedRequests.WithLabelValues(userID).Inc()
		return ErrTooManyRequests
	}
}
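
// exampleProducerConsumer is an illustrative sketch, not part of the original file: it shows the
// intended producer/consumer flow end to end. The tenant ID, querier ID, payload, and metric names
// are hypothetical placeholders, and the GaugeVec/CounterVec are stubbed with throwaway collectors
// matching the single "user" label this file uses via WithLabelValues(userID).
func exampleProducerConsumer() error {
	queueLength := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "example_queue_length"}, []string{"user"})
	discarded := prometheus.NewCounterVec(prometheus.CounterOpts{Name: "example_discarded_total"}, []string{"user"})
	q := NewRequestQueue(10, time.Minute, queueLength, discarded)

	// Producer side: enqueue a request for "tenant-1"; maxQueriers == 0 means
	// any querier may serve this tenant.
	if err := q.EnqueueRequest("tenant-1", "payload", 0, nil); err != nil {
		return err
	}

	// Consumer side: a querier worker registers its connection, then fetches a request,
	// starting iteration over user queues from the very first user.
	q.RegisterQuerierConnection("querier-1")
	defer q.UnregisterQuerierConnection("querier-1")

	req, idx, err := q.GetNextRequestForQuerier(context.Background(), FirstUser(), "querier-1")
	if err != nil {
		return err
	}
	_ = req
	_ = idx // pass idx as `last` on the next call for fair iteration
	return nil
}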

// GetNextRequestForQuerier finds the next user queue and takes the next request off of it.
// It blocks if there are no requests. By passing the user index from the previous call of this
// method, the querier guarantees that it iterates over all users fairly. If the querier finds
// that a request from the user has already expired, it can get a request for the same user by
// using UserIndex.ReuseLastUser.
func (q *RequestQueue) GetNextRequestForQuerier(ctx context.Context, last UserIndex, querierID string) (Request, UserIndex, error) {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	querierWait := false

FindQueue:
	// We need to wait if there are no users, or no pending requests for the given querier.
	for (q.queues.len() == 0 || querierWait) && ctx.Err() == nil && !q.stopped {
		querierWait = false
		q.cond.Wait(ctx)
	}

	if q.stopped {
		return nil, last, ErrStopped
	}

	if err := ctx.Err(); err != nil {
		return nil, last, err
	}

	for {
		queue, userID, idx := q.queues.getNextQueueForQuerier(last.last, querierID)
		last.last = idx
		if queue == nil {
			break
		}

		// Pick the next request from the queue; drop the queue once it is empty.
		request := <-queue
		if len(queue) == 0 {
			q.queues.deleteQueue(userID)
		}

		q.queueLength.WithLabelValues(userID).Dec()

		// Tell stopping() we've processed a request.
		q.cond.Broadcast()

		return request, last, nil
	}

	// There are no unexpired requests, so we can go back
	// and wait for more requests.
	querierWait = true
	goto FindQueue
}

func (q *RequestQueue) forgetDisconnectedQueriers(_ context.Context) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.queues.forgetDisconnectedQueriers(time.Now()) > 0 {
		// We need to notify goroutines because removing some queriers
		// may have caused resharding.
		q.cond.Broadcast()
	}

	return nil
}

func (q *RequestQueue) stopping(_ error) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	for q.queues.len() > 0 && q.connectedQuerierWorkers.Load() > 0 {
		q.cond.Wait(context.Background())
	}

	// Only stop after dispatching enqueued requests.
	q.stopped = true

	// If there are still goroutines in the GetNextRequestForQuerier method, they get notified.
	q.cond.Broadcast()

	return nil
}

func (q *RequestQueue) RegisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Inc()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.addQuerierConnection(querier)
}

func (q *RequestQueue) UnregisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Dec()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.removeQuerierConnection(querier, time.Now())
}

func (q *RequestQueue) NotifyQuerierShutdown(querierID string) {
	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.notifyQuerierShutdown(querierID)
}

func (q *RequestQueue) GetConnectedQuerierWorkersMetric() float64 {
	return float64(q.connectedQuerierWorkers.Load())
}
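
// exampleQuerierWorkerLoop is an illustrative sketch, not part of the original file: it shows how
// a querier worker is expected to drive GetNextRequestForQuerier, threading the returned UserIndex
// into the next call for fair iteration, and calling ReuseLastUser when it discards an expired
// request so the same tenant is not skipped. isExpired and handle are hypothetical helpers.
func exampleQuerierWorkerLoop(ctx context.Context, q *RequestQueue, querierID string, isExpired func(Request) bool, handle func(Request)) {
	idx := FirstUser()
	for {
		req, newIdx, err := q.GetNextRequestForQuerier(ctx, idx, querierID)
		if err != nil {
			return // context cancelled or queue stopped
		}
		idx = newIdx

		if isExpired(req) {
			// Retry the same tenant: its queue may hold more requests.
			idx = idx.ReuseLastUser()
			continue
		}
		handle(req)
	}
}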

// contextCond is a *sync.Cond with the Wait() method overridden to support context-based waiting.
type contextCond struct {
	*sync.Cond

	// testHookBeforeWaiting is called before calling Cond.Wait() if it's not nil.
	// Yes, it's ugly, but the http package settled jurisprudence:
	// https://github.com/golang/go/blob/6178d25fc0b28724b1b5aec2b1b74fc06d9294c7/src/net/http/client.go#L596-L601
	testHookBeforeWaiting func()
}

// Wait does c.Cond.Wait() but also returns if the provided context is done.
// All the documentation of sync.Cond.Wait() applies, but it's especially important to remember
// that the mutex of the cond must be held while Wait() is called (and it will be held again once
// it returns).
func (c contextCond) Wait(ctx context.Context) {
	// The "condWait" goroutine does c.Cond.Wait() and signals completion through the condWait channel.
	condWait := make(chan struct{})
	go func() {
		if c.testHookBeforeWaiting != nil {
			c.testHookBeforeWaiting()
		}
		c.Cond.Wait()
		close(condWait)
	}()

	// The "waiting" goroutine signals that the condWait goroutine has started waiting.
	// Notice that a closed waiting channel implies that the goroutine above has started waiting
	// (because it has unlocked the mutex), but the other way around is not true:
	// - condWait may have unlocked and be waiting, but someone else locked the mutex faster than us:
	//   in this case that caller will eventually unlock, and we'll be able to enter here.
	// - condWait called Wait(), unlocked, received a broadcast and locked again faster than we were
	//   able to lock here: in this case the condWait channel will be closed, and this goroutine
	//   will be waiting until we unlock.
	waiting := make(chan struct{})
	go func() {
		c.L.Lock()
		close(waiting)
		c.L.Unlock()
	}()

	select {
	case <-condWait:
		// We don't know whether the waiting goroutine is done or not, but we don't care:
		// it will be done once nobody is fighting for the mutex anymore.
	case <-ctx.Done():
		// In order to avoid leaking the condWait goroutine, we can send a broadcast.
		// Before sending the broadcast we need to make sure that the condWait goroutine is already
		// waiting (or has already waited).
		select {
		case <-condWait:
			// No need to broadcast as c.Cond.Wait() has returned already.
			return
		case <-waiting:
			// c.Cond.Wait() might still be waiting (or maybe not!), so we'll poke it just in case.
			c.Broadcast()
		}

		// Make sure we are not waiting anymore; we need to do that before returning as the caller
		// will need to unlock the mutex.
		<-condWait
	}
}
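
// exampleContextCondCancellation is an illustrative sketch, not part of the original file: it
// demonstrates the property documented on Wait, namely that it returns once the context is done
// even if no Broadcast ever arrives. The timeout value is an arbitrary assumption.
func exampleContextCondCancellation() {
	var mtx sync.Mutex
	cond := contextCond{Cond: sync.NewCond(&mtx)}

	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	// Wait must be called with the mutex held; it returns (with the mutex held
	// again) once the context expires, since nobody broadcasts here.
	mtx.Lock()
	cond.Wait(ctx)
	mtx.Unlock()
}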