github.com/grafana/pyroscope@v1.18.0/pkg/scheduler/scheduler.go (about)

     1  // SPDX-License-Identifier: AGPL-3.0-only
     2  // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/scheduler.go
     3  // Provenance-includes-license: Apache-2.0
     4  // Provenance-includes-copyright: The Cortex Authors.
     5  
     6  package scheduler
     7  
     8  import (
     9  	"context"
    10  	"flag"
    11  	"io"
    12  	"net/http"
    13  	"sync"
    14  	"time"
    15  
    16  	"connectrpc.com/connect"
    17  	"github.com/go-kit/log"
    18  	"github.com/go-kit/log/level"
    19  	"github.com/grafana/dskit/grpcclient"
    20  	"github.com/grafana/dskit/middleware"
    21  	"github.com/grafana/dskit/ring"
    22  	"github.com/grafana/dskit/services"
    23  	"github.com/grafana/dskit/tenant"
    24  	"github.com/grafana/dskit/user"
    25  	otgrpc "github.com/opentracing-contrib/go-grpc"
    26  	"github.com/opentracing/opentracing-go"
    27  	"github.com/pkg/errors"
    28  	"github.com/prometheus/client_golang/prometheus"
    29  	"github.com/prometheus/client_golang/prometheus/promauto"
    30  	"google.golang.org/grpc"
    31  
    32  	"github.com/grafana/pyroscope/pkg/frontend/frontendpb"
    33  	"github.com/grafana/pyroscope/pkg/scheduler/queue"
    34  	"github.com/grafana/pyroscope/pkg/scheduler/schedulerdiscovery"
    35  	"github.com/grafana/pyroscope/pkg/scheduler/schedulerpb"
    36  	"github.com/grafana/pyroscope/pkg/util"
    37  	"github.com/grafana/pyroscope/pkg/util/httpgrpc"
    38  	"github.com/grafana/pyroscope/pkg/util/httpgrpcutil"
    39  	"github.com/grafana/pyroscope/pkg/util/validation"
    40  )
    41  
// Scheduler is responsible for queueing and dispatching queries to Queriers.
type Scheduler struct {
	services.Service

	cfg Config
	log log.Logger

	limits Limits

	// connectedFrontendsMu guards connectedFrontends.
	connectedFrontendsMu sync.Mutex
	// connectedFrontends tracks, per frontend address, the number of open
	// FrontendLoop connections and the context shared by that frontend's queries
	// (canceled when the last connection closes).
	connectedFrontends map[string]*connectedFrontend

	requestQueue *queue.RequestQueue
	activeUsers  *util.ActiveUsersCleanupService

	// pendingRequestsMu guards pendingRequests.
	pendingRequestsMu sync.Mutex
	pendingRequests   map[requestKey]*schedulerRequest // Request is kept in this map even after being dispatched to querier. It can still be canceled at that time.

	// The ring is used to let other components discover query-scheduler replicas.
	// The ring is optional.
	schedulerLifecycler *ring.BasicLifecycler

	// Subservices manager.
	subservices        *services.Manager
	subservicesWatcher *services.FailureWatcher

	// Metrics.
	queueLength              *prometheus.GaugeVec
	discardedRequests        *prometheus.CounterVec
	cancelledRequests        *prometheus.CounterVec
	connectedQuerierClients  prometheus.GaugeFunc
	connectedFrontendClients prometheus.GaugeFunc
	queueDuration            prometheus.Histogram
	inflightRequests         prometheus.Summary

	// Embedded "unimplemented" servers keep the Scheduler forward-compatible
	// with the generated service interfaces.
	schedulerpb.UnimplementedSchedulerForFrontendServer
	schedulerpb.UnimplementedSchedulerForQuerierServer
}
    80  
// requestKey uniquely identifies a query within the scheduler: query IDs are
// assigned by frontends, so a query is scoped by the frontend address plus its ID.
type requestKey struct {
	frontendAddr string
	queryID      uint64
}
    85  
// connectedFrontend tracks the number of open connections from a single
// query-frontend together with the context shared by all of its queries.
type connectedFrontend struct {
	connections int

	// This context is used for running all queries from the same frontend.
	// When last frontend connection is closed, context is canceled.
	ctx    context.Context
	cancel context.CancelFunc
}
    94  
// Config holds the query-scheduler configuration.
type Config struct {
	MaxOutstandingPerTenant int                       `yaml:"max_outstanding_requests_per_tenant"`
	QuerierForgetDelay      time.Duration             `yaml:"querier_forget_delay" category:"experimental"`
	GRPCClientConfig        grpcclient.Config         `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."`
	ServiceDiscovery        schedulerdiscovery.Config `yaml:",inline"`

	// Dial options used to initiate outgoing gRPC connections.
	// Intended to be used by tests to use in-memory network connections.
	DialOpts []grpc.DialOption `yaml:"-"`
}
   105  
   106  func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {
   107  	f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query-scheduler. In-flight requests above this limit will fail with HTTP response status code 429.")
   108  	f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.")
   109  	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f)
   110  	cfg.ServiceDiscovery.RegisterFlags(f, logger)
   111  }
   112  
   113  func (cfg *Config) Validate() error {
   114  	return cfg.ServiceDiscovery.Validate()
   115  }
   116  
// NewScheduler creates a new Scheduler.
//
// It wires up the tenant request queue, per-tenant metrics, the active-users
// cleanup service and — when ring-based service discovery is configured — the
// ring lifecycler, grouping them all under a single subservices manager. The
// returned Scheduler is a not-yet-started services.Service.
func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer prometheus.Registerer) (*Scheduler, error) {
	var err error

	s := &Scheduler{
		cfg:    cfg,
		log:    log,
		limits: limits,

		pendingRequests:    map[requestKey]*schedulerRequest{},
		connectedFrontends: map[string]*connectedFrontend{},
		subservicesWatcher: services.NewFailureWatcher(),
	}

	s.queueLength = promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
		Name: "pyroscope_query_scheduler_queue_length",
		Help: "Number of queries in the queue.",
	}, []string{"tenant"})

	s.cancelledRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
		Name: "pyroscope_query_scheduler_cancelled_requests_total",
		Help: "Total number of query requests that were cancelled after enqueuing.",
	}, []string{"tenant"})
	s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
		Name: "pyroscope_query_scheduler_discarded_requests_total",
		Help: "Total number of query requests discarded.",
	}, []string{"tenant"})
	// The queue maintains queueLength and discardedRequests itself, so both
	// metrics must be created before the queue.
	s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests)

	s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
		Name:    "pyroscope_query_scheduler_queue_duration_seconds",
		Help:    "Time spend by requests in queue before getting picked up by a querier.",
		Buckets: prometheus.DefBuckets,
	})
	s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "pyroscope_query_scheduler_connected_querier_clients",
		Help: "Number of querier worker clients currently connected to the query-scheduler.",
	}, s.requestQueue.GetConnectedQuerierWorkersMetric)
	s.connectedFrontendClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "pyroscope_query_scheduler_connected_frontend_clients",
		Help: "Number of query-frontend worker clients currently connected to the query-scheduler.",
	}, s.getConnectedFrontendClientsMetric)

	s.inflightRequests = promauto.With(registerer).NewSummary(prometheus.SummaryOpts{
		Name:       "pyroscope_query_scheduler_inflight_requests",
		Help:       "Number of inflight requests (either queued or processing) sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s.",
		Objectives: map[float64]float64{0.5: 0.05, 0.75: 0.02, 0.8: 0.02, 0.9: 0.01, 0.95: 0.01, 0.99: 0.001},
		MaxAge:     time.Minute,
		AgeBuckets: 6,
	})

	s.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(s.cleanupMetricsForInactiveUser)
	subservices := []services.Service{s.requestQueue, s.activeUsers}

	// Init the ring only if the ring-based service discovery mode is used.
	if cfg.ServiceDiscovery.Mode == schedulerdiscovery.ModeRing {
		s.schedulerLifecycler, err = schedulerdiscovery.NewRingLifecycler(cfg.ServiceDiscovery.SchedulerRing, log, registerer)
		if err != nil {
			return nil, err
		}

		subservices = append(subservices, s.schedulerLifecycler)
	}

	s.subservices, err = services.NewManager(subservices...)
	if err != nil {
		return nil, err
	}

	s.Service = services.NewBasicService(s.starting, s.running, s.stopping)
	return s, nil
}
   189  
// Limits needed for the Query Scheduler - interface used for decoupling.
// Implemented elsewhere (e.g. by the overrides/limits component); only the
// pieces the scheduler needs are declared here.
type Limits interface {
	// MaxQueriersPerTenant returns max queriers to use per tenant, or 0 if shuffle sharding is disabled.
	MaxQueriersPerTenant(tenant string) int
}
   195  
// schedulerRequest is a single enqueued query, carrying everything needed to
// dispatch it to a querier and to report results/errors back to its frontend.
type schedulerRequest struct {
	frontendAddress string
	userID          string
	queryID         uint64
	request         *httpgrpc.HTTPRequest
	statsEnabled    bool

	// enqueueTime is when the request entered the queue; used for the
	// queue-duration histogram.
	enqueueTime time.Time

	// ctx is derived from the frontend's shared context; canceling ctxCancel
	// aborts the request. queueSpan covers the time spent queued.
	ctx       context.Context
	ctxCancel context.CancelFunc
	queueSpan opentracing.Span

	// This is only used for testing.
	parentSpanContext opentracing.SpanContext
}
   212  
// FrontendLoop handles connection from frontend.
//
// After the INIT handshake (performed in frontendConnected) it acknowledges
// the connection and then serves ENQUEUE and CANCEL messages until the
// frontend closes the stream, an I/O error occurs, or the scheduler leaves the
// Running state — in which case SHUTTING_DOWN is sent so the frontend can
// retry against a different scheduler.
func (s *Scheduler) FrontendLoop(ctx context.Context, frontend *connect.BidiStream[schedulerpb.FrontendToScheduler, schedulerpb.SchedulerToFrontend]) error {
	frontendAddress, frontendCtx, err := s.frontendConnected(frontend)
	if err != nil {
		return err
	}
	defer s.frontendDisconnected(frontendAddress)

	// Response to INIT. If scheduler is not running, we skip for-loop, send SHUTTING_DOWN and exit this method.
	if s.State() == services.Running {
		if err := frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_OK}); err != nil {
			return err
		}
	}

	// We stop accepting new queries in Stopping state. By returning quickly, we disconnect frontends, which in turns
	// cancels all their queries.
	for s.State() == services.Running {
		msg, err := frontend.Receive()
		if err != nil {
			// No need to report this as error, it is expected when query-frontend performs SendClose() (as frontendSchedulerWorker does).
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		// Re-check the state after the (potentially long) blocking Receive.
		if s.State() != services.Running {
			break // break out of the loop, and send SHUTTING_DOWN message.
		}

		var resp *schedulerpb.SchedulerToFrontend

		switch msg.GetType() {
		case schedulerpb.FrontendToSchedulerType_ENQUEUE:
			err = s.enqueueRequest(frontendCtx, frontendAddress, msg)
			switch {
			case err == nil:
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_OK}
			case errors.Is(err, queue.ErrTooManyRequests):
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_TOO_MANY_REQUESTS_PER_TENANT}
			default:
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_ERROR, Error: err.Error()}
			}

		case schedulerpb.FrontendToSchedulerType_CANCEL:
			s.cancelRequestAndRemoveFromPending(frontendAddress, msg.QueryID)
			resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_OK}

		default:
			level.Error(s.log).Log("msg", "unknown request type from frontend", "addr", frontendAddress, "type", msg.GetType())
			return errors.New("unknown request type")
		}

		err = frontend.Send(resp)
		// Failure to send response results in ending this connection.
		if err != nil {
			return err
		}
	}

	// Report shutdown back to frontend, so that it can retry with different scheduler. Also stop the frontend loop.
	return frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SchedulerToFrontendStatus_SHUTTING_DOWN})
}
   277  
   278  func (s *Scheduler) frontendConnected(frontend *connect.BidiStream[schedulerpb.FrontendToScheduler, schedulerpb.SchedulerToFrontend]) (string, context.Context, error) {
   279  	msg, err := frontend.Receive()
   280  	if err != nil {
   281  		return "", nil, err
   282  	}
   283  	if msg.Type != schedulerpb.FrontendToSchedulerType_INIT || msg.FrontendAddress == "" {
   284  		return "", nil, errors.New("no frontend address")
   285  	}
   286  
   287  	s.connectedFrontendsMu.Lock()
   288  	defer s.connectedFrontendsMu.Unlock()
   289  
   290  	cf := s.connectedFrontends[msg.FrontendAddress]
   291  	if cf == nil {
   292  		cf = &connectedFrontend{
   293  			connections: 0,
   294  		}
   295  		cf.ctx, cf.cancel = context.WithCancel(context.Background())
   296  		s.connectedFrontends[msg.FrontendAddress] = cf
   297  	}
   298  
   299  	cf.connections++
   300  	return msg.FrontendAddress, cf.ctx, nil
   301  }
   302  
   303  func (s *Scheduler) frontendDisconnected(frontendAddress string) {
   304  	s.connectedFrontendsMu.Lock()
   305  	defer s.connectedFrontendsMu.Unlock()
   306  
   307  	cf := s.connectedFrontends[frontendAddress]
   308  	cf.connections--
   309  	if cf.connections == 0 {
   310  		delete(s.connectedFrontends, frontendAddress)
   311  		cf.cancel()
   312  	}
   313  }
   314  
// enqueueRequest turns an ENQUEUE message into a schedulerRequest and submits
// it to the tenant queue. A per-request cancellable context is derived from
// the frontend's shared context, and a "queued" tracing span is started from
// the tracing headers of the embedded HTTP request. Ownership of the cancel
// function is handed over to pendingRequests inside the queue's success
// callback; until that callback runs (or if enqueueing fails), the deferred
// cancel below releases the context.
func (s *Scheduler) enqueueRequest(frontendContext context.Context, frontendAddr string, msg *schedulerpb.FrontendToScheduler) error {
	// Create new context for this request, to support cancellation.
	ctx, cancel := context.WithCancel(frontendContext)
	shouldCancel := true
	defer func() {
		if shouldCancel {
			cancel()
		}
	}()

	// Extract tracing information from headers in HTTP request. FrontendContext doesn't have the correct tracing
	// information, since that is a long-running request.
	tracer := opentracing.GlobalTracer()
	parentSpanContext, err := httpgrpcutil.GetParentSpanForRequest(tracer, msg.HttpRequest)
	if err != nil {
		return err
	}

	userID := msg.GetUserID()

	req := &schedulerRequest{
		frontendAddress: frontendAddr,
		userID:          msg.UserID,
		queryID:         msg.QueryID,
		request:         msg.HttpRequest,
		statsEnabled:    msg.StatsEnabled,
	}

	now := time.Now()

	req.parentSpanContext = parentSpanContext
	req.queueSpan, req.ctx = opentracing.StartSpanFromContextWithTracer(ctx, tracer, "queued", opentracing.ChildOf(parentSpanContext))
	req.enqueueTime = now
	req.ctxCancel = cancel

	// aggregate the max queriers limit in the case of a multi tenant query
	tenantIDs, err := tenant.TenantIDsFromOrgID(userID)
	if err != nil {
		return err
	}
	maxQueriers := validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, s.limits.MaxQueriersPerTenant)

	s.activeUsers.UpdateUserTimestamp(userID, now)
	return s.requestQueue.EnqueueRequest(userID, req, maxQueriers, func() {
		// Successfully enqueued: pendingRequests now owns the cancel function,
		// so the deferred cancel above must not fire.
		shouldCancel = false

		s.pendingRequestsMu.Lock()
		s.pendingRequests[requestKey{frontendAddr: frontendAddr, queryID: msg.QueryID}] = req
		s.pendingRequestsMu.Unlock()
	})
}
   366  
   367  // This method doesn't do removal from the queue.
   368  func (s *Scheduler) cancelRequestAndRemoveFromPending(frontendAddr string, queryID uint64) {
   369  	s.pendingRequestsMu.Lock()
   370  	defer s.pendingRequestsMu.Unlock()
   371  
   372  	key := requestKey{frontendAddr: frontendAddr, queryID: queryID}
   373  	req := s.pendingRequests[key]
   374  	if req != nil {
   375  		req.ctxCancel()
   376  	}
   377  
   378  	delete(s.pendingRequests, key)
   379  }
   380  
// BidiStreamCloser is a wrapper around BidiStream that allows to close it.
// Once closed, it will return io.EOF on Receive and Send.
type BidiStreamCloser[Req, Res any] struct {
	stream *connect.BidiStream[Req, Res] // nil once Close has been called
	lock   sync.Mutex                    // guards stream
}
   387  
   388  func (c *BidiStreamCloser[Req, Res]) Close() {
   389  	c.lock.Lock()
   390  	defer c.lock.Unlock()
   391  
   392  	if c.stream != nil {
   393  		c.stream = nil
   394  	}
   395  }
   396  
   397  func (c *BidiStreamCloser[Req, Res]) Receive() (*Req, error) {
   398  	c.lock.Lock()
   399  	defer c.lock.Unlock()
   400  
   401  	if c.stream == nil {
   402  		return nil, io.EOF
   403  	}
   404  
   405  	return c.stream.Receive()
   406  }
   407  
   408  func (b *BidiStreamCloser[Req, Res]) Send(msg *Res) error {
   409  	b.lock.Lock()
   410  	defer b.lock.Unlock()
   411  
   412  	if b.stream == nil {
   413  		return io.EOF
   414  	}
   415  	return b.stream.Send(msg)
   416  }
   417  
// QuerierLoop is started by querier to receive queries from scheduler.
//
// The first message on the stream identifies the querier; afterwards the loop
// repeatedly pulls the next request for this querier from the tenant queue and
// forwards it. Requests whose context is already canceled are skipped (while
// reusing the same tenant index, so one tenant's expired requests cannot
// starve its live ones). The loop keeps draining queues while the scheduler is
// Stopping, and ends with ErrSchedulerIsNotRunning otherwise.
func (s *Scheduler) QuerierLoop(ctx context.Context, bidi *connect.BidiStream[schedulerpb.QuerierToScheduler, schedulerpb.SchedulerToQuerier]) error {
	querier := &BidiStreamCloser[schedulerpb.QuerierToScheduler, schedulerpb.SchedulerToQuerier]{
		stream: bidi,
	}
	defer querier.Close()
	resp, err := querier.Receive()
	if err != nil {
		return err
	}

	querierID := resp.GetQuerierID()

	s.requestQueue.RegisterQuerierConnection(querierID)
	defer s.requestQueue.UnregisterQuerierConnection(querierID)

	lastUserIndex := queue.FirstUser()

	// In stopping state scheduler is not accepting new queries, but still dispatching queries in the queues.
	for s.isRunningOrStopping() {
		req, idx, err := s.requestQueue.GetNextRequestForQuerier(ctx, lastUserIndex, querierID)
		if err != nil {
			// Return a more clear error if the queue is stopped because the query-scheduler is not running.
			if errors.Is(err, queue.ErrStopped) && !s.isRunning() {
				return schedulerpb.ErrSchedulerIsNotRunning
			}

			return err
		}
		lastUserIndex = idx

		r := req.(*schedulerRequest)

		// Record how long the request sat in the queue and close its span.
		s.queueDuration.Observe(time.Since(r.enqueueTime).Seconds())
		r.queueSpan.Finish()

		/*
		  We want to dequeue the next unexpired request from the chosen tenant queue.
		  The chance of choosing a particular tenant for dequeueing is (1/active_tenants).
		  This is problematic under load, especially with other middleware enabled such as
		  querier.split-by-interval, where one request may fan out into many.
		  If expired requests aren't exhausted before checking another tenant, it would take
		  n_active_tenants * n_expired_requests_at_front_of_queue requests being processed
		  before an active request was handled for the tenant in question.
		  If this tenant meanwhile continued to queue requests,
		  it's possible that it's own queue would perpetually contain only expired requests.
		*/

		if r.ctx.Err() != nil {
			// Remove from pending requests.
			s.cancelRequestAndRemoveFromPending(r.frontendAddress, r.queryID)

			lastUserIndex = lastUserIndex.ReuseLastUser()
			continue
		}

		if err := s.forwardRequestToQuerier(querier, r); err != nil {
			return err
		}
	}

	return schedulerpb.ErrSchedulerIsNotRunning
}
   481  
   482  func (s *Scheduler) NotifyQuerierShutdown(ctx context.Context, req *connect.Request[schedulerpb.NotifyQuerierShutdownRequest]) (*connect.Response[schedulerpb.NotifyQuerierShutdownResponse], error) {
   483  	level.Info(s.log).Log("msg", "received shutdown notification from querier", "querier", req.Msg.GetQuerierID())
   484  	s.requestQueue.NotifyQuerierShutdown(req.Msg.GetQuerierID())
   485  
   486  	return connect.NewResponse(&schedulerpb.NotifyQuerierShutdownResponse{}), nil
   487  }
   488  
// forwardRequestToQuerier sends req to the connected querier and waits for its
// reply. The send/receive runs on a separate goroutine so we can concurrently
// watch req.ctx: if the request is canceled first, returning an error closes
// the querier stream, which is how cancellation is propagated downstream. On
// stream errors the failure is reported back to the frontend. In every case
// the request is removed from pendingRequests before returning.
func (s *Scheduler) forwardRequestToQuerier(querier *BidiStreamCloser[schedulerpb.QuerierToScheduler, schedulerpb.SchedulerToQuerier], req *schedulerRequest) error {
	// Make sure to cancel request at the end to cleanup resources.
	defer s.cancelRequestAndRemoveFromPending(req.frontendAddress, req.queryID)

	// Handle the stream sending & receiving on a goroutine so we can
	// monitoring the contexts in a select and cancel things appropriately.
	// errCh is buffered so the goroutine never blocks, even if we return on
	// the ctx.Done branch below.
	errCh := make(chan error, 1)
	go func() {
		err := querier.Send(&schedulerpb.SchedulerToQuerier{
			UserID:          req.userID,
			QueryID:         req.queryID,
			FrontendAddress: req.frontendAddress,
			HttpRequest:     req.request,
			StatsEnabled:    req.statsEnabled,
		})
		if err != nil {
			errCh <- err
			return
		}

		_, err = querier.Receive()
		errCh <- err
	}()

	select {
	case <-req.ctx.Done():
		// If the upstream request is cancelled (eg. frontend issued CANCEL or closed connection),
		// we need to cancel the downstream req. Only way we can do that is to close the stream (by returning error here).
		// Querier is expecting this semantics.
		s.cancelledRequests.WithLabelValues(req.userID).Inc()
		return req.ctx.Err()

	case err := <-errCh:
		// Is there was an error handling this request due to network IO,
		// then error out this upstream request _and_ stream.

		if err != nil {
			s.forwardErrorToFrontend(req.ctx, req, err)
		}
		return err
	}
}
   531  
   532  func (s *Scheduler) forwardErrorToFrontend(ctx context.Context, req *schedulerRequest, requestErr error) {
   533  	opts, err := s.cfg.GRPCClientConfig.DialOption([]grpc.UnaryClientInterceptor{
   534  		otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
   535  		middleware.ClientUserHeaderInterceptor,
   536  	},
   537  		nil, nil)
   538  	if err != nil {
   539  		level.Warn(s.log).Log("msg", "failed to create gRPC options for the connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   540  		return
   541  	}
   542  
   543  	opts = append(opts, s.cfg.DialOpts...)
   544  	conn, err := grpc.DialContext(ctx, req.frontendAddress, opts...)
   545  	if err != nil {
   546  		level.Warn(s.log).Log("msg", "failed to create gRPC connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   547  		return
   548  	}
   549  
   550  	defer func() {
   551  		_ = conn.Close()
   552  	}()
   553  
   554  	client := frontendpb.NewFrontendForQuerierClient(conn)
   555  
   556  	userCtx := user.InjectOrgID(ctx, req.userID)
   557  	_, err = client.QueryResult(userCtx, &frontendpb.QueryResultRequest{
   558  		QueryID: req.queryID,
   559  		HttpResponse: &httpgrpc.HTTPResponse{
   560  			Code: http.StatusInternalServerError,
   561  			Body: []byte(requestErr.Error()),
   562  		},
   563  	})
   564  
   565  	if err != nil {
   566  		level.Warn(s.log).Log("msg", "failed to forward error to frontend", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   567  		return
   568  	}
   569  }
   570  
   571  func (s *Scheduler) isRunning() bool {
   572  	st := s.State()
   573  	return st == services.Running
   574  }
   575  
   576  func (s *Scheduler) isRunningOrStopping() bool {
   577  	st := s.State()
   578  	return st == services.Running || st == services.Stopping
   579  }
   580  
   581  func (s *Scheduler) starting(ctx context.Context) error {
   582  	s.subservicesWatcher.WatchManager(s.subservices)
   583  
   584  	if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil {
   585  		return errors.Wrap(err, "unable to start scheduler subservices")
   586  	}
   587  
   588  	return nil
   589  }
   590  
   591  func (s *Scheduler) running(ctx context.Context) error {
   592  	// We observe inflight requests frequently and at regular intervals, to have a good
   593  	// approximation of max inflight requests over percentiles of time. We also do it with
   594  	// a ticker so that we keep tracking it even if we have no new queries but stuck inflight
   595  	// requests (e.g. queriers are all crashing).
   596  	inflightRequestsTicker := time.NewTicker(250 * time.Millisecond)
   597  	defer inflightRequestsTicker.Stop()
   598  
   599  	for {
   600  		select {
   601  		case <-inflightRequestsTicker.C:
   602  			s.pendingRequestsMu.Lock()
   603  			inflight := len(s.pendingRequests)
   604  			s.pendingRequestsMu.Unlock()
   605  
   606  			s.inflightRequests.Observe(float64(inflight))
   607  		case <-ctx.Done():
   608  			return nil
   609  		case err := <-s.subservicesWatcher.Chan():
   610  			return errors.Wrap(err, "scheduler subservice failed")
   611  		}
   612  	}
   613  }
   614  
   615  // Close the Scheduler.
   616  func (s *Scheduler) stopping(_ error) error {
   617  	// This will also stop the requests queue, which stop accepting new requests and errors out any pending requests.
   618  	return services.StopManagerAndAwaitStopped(context.Background(), s.subservices)
   619  }
   620  
   621  func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
   622  	s.queueLength.DeleteLabelValues(user)
   623  	s.discardedRequests.DeleteLabelValues(user)
   624  	s.cancelledRequests.DeleteLabelValues(user)
   625  }
   626  
   627  func (s *Scheduler) getConnectedFrontendClientsMetric() float64 {
   628  	s.connectedFrontendsMu.Lock()
   629  	defer s.connectedFrontendsMu.Unlock()
   630  
   631  	count := 0
   632  	for _, workers := range s.connectedFrontends {
   633  		count += workers.connections
   634  	}
   635  
   636  	return float64(count)
   637  }
   638  
// RingHandler serves the query-scheduler ring status page when ring-based
// service discovery is enabled; otherwise it renders a static page stating
// that the hash ring is disabled.
func (s *Scheduler) RingHandler(w http.ResponseWriter, req *http.Request) {
	if s.schedulerLifecycler != nil {
		s.schedulerLifecycler.ServeHTTP(w, req)
		return
	}

	ringDisabledPage := `
		<!DOCTYPE html>
		<html>
			<head>
				<meta charset="UTF-8">
				<title>Query-scheduler Status</title>
			</head>
			<body>
				<h1>Query-scheduler Status</h1>
				<p>Query-scheduler hash ring is disabled.</p>
			</body>
		</html>`
	util.WriteHTMLResponse(w, ringDisabledPage)
}