github.com/grafana/pyroscope@v1.18.0/pkg/scheduler/queue/queue.go (about)

     1  // SPDX-License-Identifier: AGPL-3.0-only
     2  // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/queue/queue.go
     3  // Provenance-includes-license: Apache-2.0
     4  // Provenance-includes-copyright: The Cortex Authors.
     5  
     6  package queue
     7  
     8  import (
     9  	"context"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/grafana/dskit/services"
    14  	"github.com/pkg/errors"
    15  	"github.com/prometheus/client_golang/prometheus"
    16  	"go.uber.org/atomic"
    17  )
    18  
const (
	// forgetCheckPeriod is how frequently the timer service (see NewRequestQueue)
	// invokes forgetDisconnectedQueriers to drop queriers that disconnected
	// longer than the configured forgetDelay ago.
	forgetCheckPeriod = 5 * time.Second
)
    23  
var (
	// ErrTooManyRequests is returned by EnqueueRequest when the user's queue
	// is already at maxOutstandingPerTenant capacity.
	ErrTooManyRequests = errors.New("too many outstanding requests")
	// ErrStopped is returned by EnqueueRequest and GetNextRequestForQuerier
	// after the queue service has been stopped.
	ErrStopped = errors.New("queue is stopped")
)
    28  
    29  // UserIndex is opaque type that allows to resume iteration over users between successive calls
    30  // of RequestQueue.GetNextRequestForQuerier method.
    31  type UserIndex struct {
    32  	last int
    33  }
    34  
    35  // Modify index to start iteration on the same user, for which last queue was returned.
    36  func (ui UserIndex) ReuseLastUser() UserIndex {
    37  	if ui.last >= 0 {
    38  		return UserIndex{last: ui.last - 1}
    39  	}
    40  	return ui
    41  }
    42  
    43  // FirstUser returns UserIndex that starts iteration over user queues from the very first user.
    44  func FirstUser() UserIndex {
    45  	return UserIndex{last: -1}
    46  }
    47  
// Request stored into the queue. The queue never inspects the value; it is
// opaque to the queue and only ferried between EnqueueRequest and
// GetNextRequestForQuerier.
type Request interface{}
    50  
// RequestQueue holds incoming requests in per-user queues. It also assigns each user specified number of queriers,
// and when querier asks for next request to handle (using GetNextRequestForQuerier), it returns requests
// in a fair fashion.
type RequestQueue struct {
	services.Service

	// Count of registered querier worker connections. Maintained with atomic
	// ops outside mtx; read by stopping() and GetConnectedQuerierWorkersMetric.
	connectedQuerierWorkers *atomic.Int32

	mtx     sync.Mutex
	cond    contextCond // Notified when request is enqueued or dequeued, or querier is disconnected.
	queues  *queues     // Per-user queues and querier assignments; guarded by mtx.
	stopped bool        // Set by stopping(); guarded by mtx.

	queueLength       *prometheus.GaugeVec   // Per user and reason.
	discardedRequests *prometheus.CounterVec // Per user.
}
    67  
    68  func NewRequestQueue(maxOutstandingPerTenant int, forgetDelay time.Duration, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec) *RequestQueue {
    69  	q := &RequestQueue{
    70  		queues:                  newUserQueues(maxOutstandingPerTenant, forgetDelay),
    71  		connectedQuerierWorkers: atomic.NewInt32(0),
    72  		queueLength:             queueLength,
    73  		discardedRequests:       discardedRequests,
    74  	}
    75  
    76  	q.cond = contextCond{Cond: sync.NewCond(&q.mtx)}
    77  	q.Service = services.NewTimerService(forgetCheckPeriod, nil, q.forgetDisconnectedQueriers, q.stopping).WithName("request queue")
    78  
    79  	return q
    80  }
    81  
    82  // EnqueueRequest puts the request into the queue. MaxQueries is user-specific value that specifies how many queriers can
    83  // this user use (zero or negative = all queriers). It is passed to each EnqueueRequest, because it can change
    84  // between calls.
    85  //
    86  // If request is successfully enqueued, successFn is called with the lock held, before any querier can receive the request.
    87  func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error {
    88  	q.mtx.Lock()
    89  	defer q.mtx.Unlock()
    90  
    91  	if q.stopped {
    92  		return ErrStopped
    93  	}
    94  
    95  	queue := q.queues.getOrAddQueue(userID, maxQueriers)
    96  	if queue == nil {
    97  		// This can only happen if userID is "".
    98  		return errors.New("no queue found")
    99  	}
   100  
   101  	select {
   102  	case queue <- req:
   103  		q.queueLength.WithLabelValues(userID).Inc()
   104  		q.cond.Broadcast()
   105  		// Call this function while holding a lock. This guarantees that no querier can fetch the request before function returns.
   106  		if successFn != nil {
   107  			successFn()
   108  		}
   109  		return nil
   110  	default:
   111  		q.discardedRequests.WithLabelValues(userID).Inc()
   112  		return ErrTooManyRequests
   113  	}
   114  }
   115  
   116  // GetNextRequestForQuerier find next user queue and takes the next request off of it. Will block if there are no requests.
   117  // By passing user index from previous call of this method, querier guarantees that it iterates over all users fairly.
   118  // If querier finds that request from the user is already expired, it can get a request for the same user by using UserIndex.ReuseLastUser.
   119  func (q *RequestQueue) GetNextRequestForQuerier(ctx context.Context, last UserIndex, querierID string) (Request, UserIndex, error) {
   120  	q.mtx.Lock()
   121  	defer q.mtx.Unlock()
   122  
   123  	querierWait := false
   124  
   125  FindQueue:
   126  	// We need to wait if there are no users, or no pending requests for given querier.
   127  	for (q.queues.len() == 0 || querierWait) && ctx.Err() == nil && !q.stopped {
   128  		querierWait = false
   129  		q.cond.Wait(ctx)
   130  	}
   131  
   132  	if q.stopped {
   133  		return nil, last, ErrStopped
   134  	}
   135  
   136  	if err := ctx.Err(); err != nil {
   137  		return nil, last, err
   138  	}
   139  
   140  	for {
   141  		queue, userID, idx := q.queues.getNextQueueForQuerier(last.last, querierID)
   142  		last.last = idx
   143  		if queue == nil {
   144  			break
   145  		}
   146  
   147  		// Pick next request from the queue.
   148  		for {
   149  			request := <-queue
   150  			if len(queue) == 0 {
   151  				q.queues.deleteQueue(userID)
   152  			}
   153  
   154  			q.queueLength.WithLabelValues(userID).Dec()
   155  
   156  			// Tell close() we've processed a request.
   157  			q.cond.Broadcast()
   158  
   159  			return request, last, nil
   160  		}
   161  	}
   162  
   163  	// There are no unexpired requests, so we can get back
   164  	// and wait for more requests.
   165  	querierWait = true
   166  	goto FindQueue
   167  }
   168  
   169  func (q *RequestQueue) forgetDisconnectedQueriers(_ context.Context) error {
   170  	q.mtx.Lock()
   171  	defer q.mtx.Unlock()
   172  
   173  	if q.queues.forgetDisconnectedQueriers(time.Now()) > 0 {
   174  		// We need to notify goroutines cause having removed some queriers
   175  		// may have caused a resharding.
   176  		q.cond.Broadcast()
   177  	}
   178  
   179  	return nil
   180  }
   181  
   182  func (q *RequestQueue) stopping(_ error) error {
   183  	q.mtx.Lock()
   184  	defer q.mtx.Unlock()
   185  
   186  	for q.queues.len() > 0 && q.connectedQuerierWorkers.Load() > 0 {
   187  		q.cond.Wait(context.Background())
   188  	}
   189  
   190  	// Only stop after dispatching enqueued requests.
   191  	q.stopped = true
   192  
   193  	// If there are still goroutines in GetNextRequestForQuerier method, they get notified.
   194  	q.cond.Broadcast()
   195  
   196  	return nil
   197  }
   198  
// RegisterQuerierConnection records one new worker connection from the given
// querier, making its users' queues eligible for dispatch to it.
// The worker counter is atomic and deliberately updated outside the lock.
func (q *RequestQueue) RegisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Inc()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.addQuerierConnection(querier)
}
   206  
// UnregisterQuerierConnection records the loss of one worker connection from
// the given querier; the timestamp is used for the forget-delay bookkeeping.
// The worker counter is atomic and deliberately updated outside the lock.
func (q *RequestQueue) UnregisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Dec()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.removeQuerierConnection(querier, time.Now())
}
   214  
// NotifyQuerierShutdown informs the queue that the given querier is shutting
// down gracefully (as opposed to merely losing a connection).
func (q *RequestQueue) NotifyQuerierShutdown(querierID string) {
	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.notifyQuerierShutdown(querierID)
}
   220  
// GetConnectedQuerierWorkersMetric returns the current number of connected
// querier workers as a float64, suitable for a Prometheus gauge callback.
func (q *RequestQueue) GetConnectedQuerierWorkersMetric() float64 {
	return float64(q.connectedQuerierWorkers.Load())
}
   224  
// contextCond is a *sync.Cond with Wait() method overridden to support context-based waiting.
// All other sync.Cond methods (Broadcast, Signal, the L mutex) are used as-is
// via embedding.
type contextCond struct {
	*sync.Cond

	// testHookBeforeWaiting is called before calling Cond.Wait() if it's not nil.
	// Yes, it's ugly, but the http package settled jurisprudence:
	// https://github.com/golang/go/blob/6178d25fc0b28724b1b5aec2b1b74fc06d9294c7/src/net/http/client.go#L596-L601
	testHookBeforeWaiting func()
}
   234  
// Wait does c.cond.Wait() but will also return if the context provided is done.
// All the documentation of sync.Cond.Wait() applies, but it's especially important to remember that the mutex of
// the cond should be held while Wait() is called (and mutex will be held once it returns).
//
// NOTE: each call spawns two short-lived goroutines; they are not leaked —
// both are guaranteed to terminate before or shortly after Wait returns.
func (c contextCond) Wait(ctx context.Context) {
	// "condWait" goroutine does q.cond.Wait() and signals through condWait channel.
	// Cond.Wait() releases the mutex while blocked and re-acquires it before
	// returning, so condWait is closed only once the mutex is held again.
	condWait := make(chan struct{})
	go func() {
		if c.testHookBeforeWaiting != nil {
			c.testHookBeforeWaiting()
		}
		c.Cond.Wait()
		close(condWait)
	}()

	// "waiting" goroutine: signals that the condWait goroutine has started waiting.
	// Notice that a closed waiting channel implies that the goroutine above has started waiting
	// (because it has unlocked the mutex), but the other way is not true:
	// - condWait it may have unlocked and is waiting, but someone else locked the mutex faster than us:
	//   in this case that caller will eventually unlock, and we'll be able to enter here.
	// - condWait called Wait(), unlocked, received a broadcast and locked again faster than we were able to lock here:
	//   in this case condWait channel will be closed, and this goroutine will be waiting until we unlock.
	waiting := make(chan struct{})
	go func() {
		c.L.Lock()
		close(waiting)
		c.L.Unlock()
	}()

	select {
	case <-condWait:
		// We don't know whether the waiting goroutine is done or not, but we don't care:
		// it will be done once nobody is fighting for the mutex anymore.
	case <-ctx.Done():
		// In order to avoid leaking the condWait goroutine, we can send a broadcast.
		// Before sending the broadcast we need to make sure that condWait goroutine is already waiting (or has already waited).
		select {
		case <-condWait:
			// No need to broadcast as q.cond.Wait() has returned already.
			return
		case <-waiting:
			// q.cond.Wait() might be still waiting (or maybe not!), so we'll poke it just in case.
			// The spurious wakeup this causes for other waiters is harmless:
			// callers re-check their condition in a loop.
			c.Broadcast()
		}

		// Make sure we are not waiting anymore, we need to do that before returning as the caller will need to unlock the mutex.
		<-condWait
	}
}