// github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/scheduler/queue/queue.go

package queue

import (
	"context"
	"sync"
	"time"

	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"
)

const (
	// How frequently to check for disconnected queriers that should be forgotten.
	forgetCheckPeriod = 5 * time.Second
)

var (
	ErrTooManyRequests = errors.New("too many outstanding requests")
	ErrStopped         = errors.New("queue is stopped")
)

// UserIndex is an opaque type that allows resuming iteration over users between
// successive calls of the RequestQueue.GetNextRequestForQuerier method.
type UserIndex struct {
	last int
}

// ReuseLastUser modifies the index so that iteration restarts at the same user
// for which the last queue was returned.
func (ui UserIndex) ReuseLastUser() UserIndex {
	if ui.last >= 0 {
		return UserIndex{last: ui.last - 1}
	}
	return ui
}

// FirstUser returns a UserIndex that starts iteration over user queues from the very first user.
func FirstUser() UserIndex {
	return UserIndex{last: -1}
}

// Request is a request stored in the queue.
type Request interface{}

// RequestQueue holds incoming requests in per-user queues. It also assigns each user a specified
// number of queriers, and when a querier asks for the next request to handle (using
// GetNextRequestForQuerier), it returns requests in a fair fashion.
type RequestQueue struct {
	services.Service

	connectedQuerierWorkers *atomic.Int32

	mtx     sync.Mutex
	cond    contextCond // Notified when a request is enqueued or dequeued, or a querier is disconnected.
	queues  *queues
	stopped bool

	queueLength       *prometheus.GaugeVec   // Per user and reason.
	discardedRequests *prometheus.CounterVec // Per user.
}

func NewRequestQueue(maxOutstandingPerTenant int, forgetDelay time.Duration, queueLength *prometheus.GaugeVec, discardedRequests *prometheus.CounterVec) *RequestQueue {
	q := &RequestQueue{
		queues:                  newUserQueues(maxOutstandingPerTenant, forgetDelay),
		connectedQuerierWorkers: atomic.NewInt32(0),
		queueLength:             queueLength,
		discardedRequests:       discardedRequests,
	}

	q.cond = contextCond{Cond: sync.NewCond(&q.mtx)}
	q.Service = services.NewTimerService(forgetCheckPeriod, nil, q.forgetDisconnectedQueriers, q.stopping).WithName("request queue")

	return q
}

// EnqueueRequest puts the request into the queue. maxQueriers is a per-user value that specifies
// how many queriers this user can use (zero or negative means all queriers). It is passed on each
// EnqueueRequest call because it can change between calls.
//
// If the request is successfully enqueued, successFn is called with the lock held, before any
// querier can receive the request.
func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.stopped {
		return ErrStopped
	}

	queue := q.queues.getOrAddQueue(userID, maxQueriers)
	if queue == nil {
		// This can only happen if userID is "".
		return errors.New("no queue found")
	}

	select {
	case queue <- req:
		q.queueLength.WithLabelValues(userID).Inc()
		q.cond.Broadcast()
		// Call successFn while holding the lock. This guarantees that no querier can fetch the
		// request before the function returns.
		if successFn != nil {
			successFn()
		}
		return nil
	default:
		q.discardedRequests.WithLabelValues(userID).Inc()
		return ErrTooManyRequests
	}
}
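
// exampleProducerConsumer is an illustrative sketch, not part of the original file: it shows the
// intended producer/consumer flow end to end. The tenant ID, querier ID, payload, and metric names
// are hypothetical placeholders, and the GaugeVec/CounterVec are stubbed with throwaway collectors
// matching the single "user" label this file uses via WithLabelValues(userID).
func exampleProducerConsumer() error {
	queueLength := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "example_queue_length"}, []string{"user"})
	discarded := prometheus.NewCounterVec(prometheus.CounterOpts{Name: "example_discarded_total"}, []string{"user"})
	q := NewRequestQueue(10, time.Minute, queueLength, discarded)

	// Producer side: enqueue a request for "tenant-1"; maxQueriers == 0 means
	// any querier may serve this tenant.
	if err := q.EnqueueRequest("tenant-1", "payload", 0, nil); err != nil {
		return err
	}

	// Consumer side: a querier worker registers its connection, then fetches a request,
	// starting iteration over user queues from the very first user.
	q.RegisterQuerierConnection("querier-1")
	defer q.UnregisterQuerierConnection("querier-1")

	req, idx, err := q.GetNextRequestForQuerier(context.Background(), FirstUser(), "querier-1")
	if err != nil {
		return err
	}
	_ = req
	_ = idx // pass idx as `last` on the next call for fair iteration
	return nil
}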

// GetNextRequestForQuerier finds the next user queue and takes the next request off of it.
// It blocks if there are no requests. By passing the user index from the previous call of this
// method, the querier guarantees that it iterates over all users fairly. If the querier finds
// that a request from the user has already expired, it can get a request for the same user by
// using UserIndex.ReuseLastUser.
func (q *RequestQueue) GetNextRequestForQuerier(ctx context.Context, last UserIndex, querierID string) (Request, UserIndex, error) {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	querierWait := false

FindQueue:
	// We need to wait if there are no users, or no pending requests for the given querier.
	for (q.queues.len() == 0 || querierWait) && ctx.Err() == nil && !q.stopped {
		querierWait = false
		q.cond.Wait(ctx)
	}

	if q.stopped {
		return nil, last, ErrStopped
	}

	if err := ctx.Err(); err != nil {
		return nil, last, err
	}

	for {
		queue, userID, idx := q.queues.getNextQueueForQuerier(last.last, querierID)
		last.last = idx
		if queue == nil {
			break
		}

		// Pick the next request from the queue; drop the queue once it is empty.
		request := <-queue
		if len(queue) == 0 {
			q.queues.deleteQueue(userID)
		}

		q.queueLength.WithLabelValues(userID).Dec()

		// Tell stopping() we've processed a request.
		q.cond.Broadcast()

		return request, last, nil
	}

	// There are no unexpired requests, so we can go back
	// and wait for more requests.
	querierWait = true
	goto FindQueue
}

func (q *RequestQueue) forgetDisconnectedQueriers(_ context.Context) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	if q.queues.forgetDisconnectedQueriers(time.Now()) > 0 {
		// We need to notify goroutines because removing some queriers
		// may have caused resharding.
		q.cond.Broadcast()
	}

	return nil
}

func (q *RequestQueue) stopping(_ error) error {
	q.mtx.Lock()
	defer q.mtx.Unlock()

	for q.queues.len() > 0 && q.connectedQuerierWorkers.Load() > 0 {
		q.cond.Wait(context.Background())
	}

	// Only stop after dispatching enqueued requests.
	q.stopped = true

	// If there are still goroutines in the GetNextRequestForQuerier method, they get notified.
	q.cond.Broadcast()

	return nil
}

func (q *RequestQueue) RegisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Inc()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.addQuerierConnection(querier)
}

func (q *RequestQueue) UnregisterQuerierConnection(querier string) {
	q.connectedQuerierWorkers.Dec()

	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.removeQuerierConnection(querier, time.Now())
}

func (q *RequestQueue) NotifyQuerierShutdown(querierID string) {
	q.mtx.Lock()
	defer q.mtx.Unlock()
	q.queues.notifyQuerierShutdown(querierID)
}

func (q *RequestQueue) GetConnectedQuerierWorkersMetric() float64 {
	return float64(q.connectedQuerierWorkers.Load())
}
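
// exampleQuerierWorkerLoop is an illustrative sketch, not part of the original file: it shows how
// a querier worker is expected to drive GetNextRequestForQuerier, threading the returned UserIndex
// into the next call for fair iteration, and calling ReuseLastUser when it discards an expired
// request so the same tenant is not skipped. isExpired and handle are hypothetical helpers.
func exampleQuerierWorkerLoop(ctx context.Context, q *RequestQueue, querierID string, isExpired func(Request) bool, handle func(Request)) {
	idx := FirstUser()
	for {
		req, newIdx, err := q.GetNextRequestForQuerier(ctx, idx, querierID)
		if err != nil {
			return // context cancelled or queue stopped
		}
		idx = newIdx

		if isExpired(req) {
			// Retry the same tenant: its queue may hold more requests.
			idx = idx.ReuseLastUser()
			continue
		}
		handle(req)
	}
}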

// contextCond is a *sync.Cond with the Wait() method overridden to support context-based waiting.
type contextCond struct {
	*sync.Cond

	// testHookBeforeWaiting is called before calling Cond.Wait() if it's not nil.
	// Yes, it's ugly, but the http package settled jurisprudence:
	// https://github.com/golang/go/blob/6178d25fc0b28724b1b5aec2b1b74fc06d9294c7/src/net/http/client.go#L596-L601
	testHookBeforeWaiting func()
}

// Wait does c.Cond.Wait() but also returns if the provided context is done.
// All the documentation of sync.Cond.Wait() applies, but it's especially important to remember
// that the mutex of the cond must be held while Wait() is called (and it will be held again once
// it returns).
func (c contextCond) Wait(ctx context.Context) {
	// The "condWait" goroutine does c.Cond.Wait() and signals completion through the condWait channel.
	condWait := make(chan struct{})
	go func() {
		if c.testHookBeforeWaiting != nil {
			c.testHookBeforeWaiting()
		}
		c.Cond.Wait()
		close(condWait)
	}()

	// The "waiting" goroutine signals that the condWait goroutine has started waiting.
	// Notice that a closed waiting channel implies that the goroutine above has started waiting
	// (because it has unlocked the mutex), but the other way around is not true:
	// - condWait may have unlocked and be waiting, but someone else locked the mutex faster than us:
	//   in this case that caller will eventually unlock, and we'll be able to enter here.
	// - condWait called Wait(), unlocked, received a broadcast and locked again faster than we were
	//   able to lock here: in this case the condWait channel will be closed, and this goroutine
	//   will be waiting until we unlock.
	waiting := make(chan struct{})
	go func() {
		c.L.Lock()
		close(waiting)
		c.L.Unlock()
	}()

	select {
	case <-condWait:
		// We don't know whether the waiting goroutine is done or not, but we don't care:
		// it will be done once nobody is fighting for the mutex anymore.
	case <-ctx.Done():
		// In order to avoid leaking the condWait goroutine, we can send a broadcast.
		// Before sending the broadcast we need to make sure that the condWait goroutine is already
		// waiting (or has already waited).
		select {
		case <-condWait:
			// No need to broadcast as c.Cond.Wait() has returned already.
			return
		case <-waiting:
			// c.Cond.Wait() might still be waiting (or maybe not!), so we'll poke it just in case.
			c.Broadcast()
		}

		// Make sure we are not waiting anymore; we need to do that before returning as the caller
		// will need to unlock the mutex.
		<-condWait
	}
}
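
// exampleContextCondCancellation is an illustrative sketch, not part of the original file: it
// demonstrates the property documented on Wait, namely that it returns once the context is done
// even if no Broadcast ever arrives. The timeout value is an arbitrary assumption.
func exampleContextCondCancellation() {
	var mtx sync.Mutex
	cond := contextCond{Cond: sync.NewCond(&mtx)}

	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	// Wait must be called with the mutex held; it returns (with the mutex held
	// again) once the context expires, since nobody broadcasts here.
	mtx.Lock()
	cond.Wait(ctx)
	mtx.Unlock()
}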