github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/scheduler/scheduler.go

     1  package scheduler
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"io"
     7  	"net/http"
     8  	"net/textproto"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/go-kit/log"
    13  	"github.com/go-kit/log/level"
    14  	"github.com/grafana/dskit/grpcclient"
    15  	"github.com/grafana/dskit/kv"
    16  	"github.com/grafana/dskit/ring"
    17  	"github.com/grafana/dskit/services"
    18  	otgrpc "github.com/opentracing-contrib/go-grpc"
    19  	"github.com/opentracing/opentracing-go"
    20  	"github.com/pkg/errors"
    21  	"github.com/prometheus/client_golang/prometheus"
    22  	"github.com/prometheus/client_golang/prometheus/promauto"
    23  	"github.com/weaveworks/common/httpgrpc"
    24  	"github.com/weaveworks/common/middleware"
    25  	"github.com/weaveworks/common/user"
    26  	"go.uber.org/atomic"
    27  	"google.golang.org/grpc"
    28  
    29  	"github.com/grafana/dskit/tenant"
    30  
    31  	"github.com/grafana/loki/pkg/lokifrontend/frontend/v2/frontendv2pb"
    32  	"github.com/grafana/loki/pkg/scheduler/queue"
    33  	"github.com/grafana/loki/pkg/scheduler/schedulerpb"
    34  	"github.com/grafana/loki/pkg/util"
    35  	lokiutil "github.com/grafana/loki/pkg/util"
    36  	lokigrpc "github.com/grafana/loki/pkg/util/httpgrpc"
    37  	lokihttpreq "github.com/grafana/loki/pkg/util/httpreq"
    38  	util_log "github.com/grafana/loki/pkg/util/log"
    39  	"github.com/grafana/loki/pkg/util/validation"
    40  )
    41  
    42  var errSchedulerIsNotRunning = errors.New("scheduler is not running")
    43  
    44  const (
     45  	// ringAutoForgetUnhealthyPeriods is how many consecutive heartbeat timeout periods must pass
     46  	// before an unhealthy instance in the ring is automatically removed.
    47  	ringAutoForgetUnhealthyPeriods = 10
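         	// (Illustrative arithmetic, not a value set in this file: with a heartbeat
         	// timeout of 1m, an unhealthy scheduler would be auto-forgotten from the
         	// ring after 10 * 1m = 10m.)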
    48  
     49  	// ringKey is the key under which we store the scheduler ring in the KVStore.
    50  	ringKey = "scheduler"
    51  
     52  	// ringNameForServer is the name of the ring used by the scheduler server.
    53  	ringNameForServer = "scheduler"
    54  
    55  	// ringReplicationFactor should be 2 because we want 2 schedulers.
    56  	ringReplicationFactor = 2
    57  
     58  	// ringNumTokens is how many tokens each scheduler registers in the ring;
     59  	// we only need to insert 1 token, used for leader election purposes.
    60  	ringNumTokens = 1
    61  
     62  	// ringCheckPeriod is how often we check the ring to see if this instance is still in
     63  	// the replication set of instances acting as schedulers.
    64  	ringCheckPeriod = 3 * time.Second
    65  )
    66  
    67  // Scheduler is responsible for queueing and dispatching queries to Queriers.
    68  type Scheduler struct {
    69  	services.Service
    70  
    71  	cfg Config
    72  	log log.Logger
    73  
    74  	limits Limits
    75  
    76  	connectedFrontendsMu sync.Mutex
    77  	connectedFrontends   map[string]*connectedFrontend
    78  
    79  	requestQueue *queue.RequestQueue
    80  	activeUsers  *util.ActiveUsersCleanupService
    81  
    82  	pendingRequestsMu sync.Mutex
    83  	pendingRequests   map[requestKey]*schedulerRequest // Request is kept in this map even after being dispatched to querier. It can still be canceled at that time.
    84  
    85  	// Subservices manager.
    86  	subservices        *services.Manager
    87  	subservicesWatcher *services.FailureWatcher
    88  
    89  	// Metrics.
    90  	queueLength              *prometheus.GaugeVec
    91  	discardedRequests        *prometheus.CounterVec
    92  	connectedQuerierClients  prometheus.GaugeFunc
    93  	connectedFrontendClients prometheus.GaugeFunc
    94  	queueDuration            prometheus.Histogram
    95  	schedulerRunning         prometheus.Gauge
    96  	inflightRequests         prometheus.Summary
    97  
    98  	// Ring used for finding schedulers
    99  	ringLifecycler *ring.BasicLifecycler
   100  	ring           *ring.Ring
   101  
   102  	// Controls for this being a chosen scheduler
   103  	shouldRun atomic.Bool
   104  }
   105  
   106  type requestKey struct {
   107  	frontendAddr string
   108  	queryID      uint64
   109  }
   110  
   111  type connectedFrontend struct {
   112  	connections int
   113  	frontend    schedulerpb.SchedulerForFrontend_FrontendLoopServer
   114  
    115  	// This context is used for running all queries from the same frontend.
    116  	// When the last frontend connection is closed, the context is canceled.
   117  	ctx    context.Context
   118  	cancel context.CancelFunc
   119  }
   120  
   121  type Config struct {
   122  	MaxOutstandingPerTenant int               `yaml:"max_outstanding_requests_per_tenant"`
   123  	QuerierForgetDelay      time.Duration     `yaml:"querier_forget_delay"`
   124  	GRPCClientConfig        grpcclient.Config `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."`
   125  	// Schedulers ring
   126  	UseSchedulerRing bool                `yaml:"use_scheduler_ring"`
   127  	SchedulerRing    lokiutil.RingConfig `yaml:"scheduler_ring,omitempty"`
   128  }
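         
         // An illustrative (not authoritative) YAML sketch of this block, using the field
         // names declared above; the enclosing query_scheduler key and the values shown
         // are assumptions for the example, not defaults taken from this file:
         //
         //	query_scheduler:
         //	  max_outstanding_requests_per_tenant: 100
         //	  querier_forget_delay: 0s
         //	  use_scheduler_ring: true
         //	  scheduler_ring:
         //	    kvstore:
         //	      store: memberlist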
   129  
   130  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
   131  	f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query scheduler. In-flight requests above this limit will fail with HTTP response status code 429.")
   132  	f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.")
   133  	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f)
    134  	f.BoolVar(&cfg.UseSchedulerRing, "query-scheduler.use-scheduler-ring", false, "Set to true to have the query scheduler create a ring and the frontend and frontend_worker use this ring to get the addresses of the query schedulers. If frontend_address and scheduler_address are not present in the config, this value will be toggled by Loki to true.")
   135  	cfg.SchedulerRing.RegisterFlagsWithPrefix("query-scheduler.", "collectors/", f)
   136  }
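         
         // For reference, the same settings expressed as the CLI flags registered above
         // (values illustrative only):
         //
         //	-query-scheduler.max-outstanding-requests-per-tenant=100
         //	-query-scheduler.querier-forget-delay=10s
         //	-query-scheduler.use-scheduler-ring=true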
   137  
   138  // NewScheduler creates a new Scheduler.
   139  func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer prometheus.Registerer) (*Scheduler, error) {
   140  	s := &Scheduler{
   141  		cfg:    cfg,
   142  		log:    log,
   143  		limits: limits,
   144  
   145  		pendingRequests:    map[requestKey]*schedulerRequest{},
   146  		connectedFrontends: map[string]*connectedFrontend{},
   147  	}
   148  
   149  	s.queueLength = promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
   150  		Name: "cortex_query_scheduler_queue_length",
   151  		Help: "Number of queries in the queue.",
   152  	}, []string{"user"})
   153  
   154  	s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
   155  		Name: "cortex_query_scheduler_discarded_requests_total",
   156  		Help: "Total number of query requests discarded.",
   157  	}, []string{"user"})
   158  	s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests)
   159  
   160  	s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
   161  		Name:    "cortex_query_scheduler_queue_duration_seconds",
    162  		Help:    "Time spent by requests in the queue before getting picked up by a querier.",
   163  		Buckets: prometheus.DefBuckets,
   164  	})
   165  	s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
   166  		Name: "cortex_query_scheduler_connected_querier_clients",
   167  		Help: "Number of querier worker clients currently connected to the query-scheduler.",
   168  	}, s.requestQueue.GetConnectedQuerierWorkersMetric)
   169  	s.connectedFrontendClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
   170  		Name: "cortex_query_scheduler_connected_frontend_clients",
   171  		Help: "Number of query-frontend worker clients currently connected to the query-scheduler.",
   172  	}, s.getConnectedFrontendClientsMetric)
   173  	s.schedulerRunning = promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
   174  		Name: "cortex_query_scheduler_running",
   175  		Help: "Value will be 1 if the scheduler is in the ReplicationSet and actively receiving/processing requests",
   176  	})
   177  	s.inflightRequests = promauto.With(registerer).NewSummary(prometheus.SummaryOpts{
   178  		Name:       "cortex_query_scheduler_inflight_requests",
   179  		Help:       "Number of inflight requests (either queued or processing) sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s.",
   180  		Objectives: map[float64]float64{0.5: 0.05, 0.75: 0.02, 0.8: 0.02, 0.9: 0.01, 0.95: 0.01, 0.99: 0.001},
   181  		MaxAge:     time.Minute,
   182  		AgeBuckets: 6,
   183  	})
   184  
   185  	s.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(s.cleanupMetricsForInactiveUser)
   186  
   187  	svcs := []services.Service{s.requestQueue, s.activeUsers}
   188  
   189  	if cfg.UseSchedulerRing {
   190  		s.shouldRun.Store(false)
   191  		ringStore, err := kv.NewClient(
   192  			cfg.SchedulerRing.KVStore,
   193  			ring.GetCodec(),
   194  			kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("loki_", registerer), "scheduler"),
   195  			log,
   196  		)
   197  		if err != nil {
   198  			return nil, errors.Wrap(err, "create KV store client")
   199  		}
   200  		lifecyclerCfg, err := cfg.SchedulerRing.ToLifecyclerConfig(ringNumTokens, log)
   201  		if err != nil {
   202  			return nil, errors.Wrap(err, "invalid ring lifecycler config")
   203  		}
   204  
   205  		// Define lifecycler delegates in reverse order (last to be called defined first because they're
   206  		// chained via "next delegate").
   207  		delegate := ring.BasicLifecyclerDelegate(s)
   208  		delegate = ring.NewLeaveOnStoppingDelegate(delegate, log)
   209  		delegate = ring.NewTokensPersistencyDelegate(cfg.SchedulerRing.TokensFilePath, ring.JOINING, delegate, log)
   210  		delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*cfg.SchedulerRing.HeartbeatTimeout, delegate, log)
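         		// The resulting chain, outermost first: AutoForget -> TokensPersistency ->
         		// LeaveOnStopping -> Scheduler, i.e. the delegate defined first above is the
         		// last one to be invoked.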
   211  
   212  		s.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, ringNameForServer, ringKey, ringStore, delegate, log, registerer)
   213  		if err != nil {
   214  			return nil, errors.Wrap(err, "create ring lifecycler")
   215  		}
   216  
   217  		ringCfg := cfg.SchedulerRing.ToRingConfig(ringReplicationFactor)
   218  		s.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, ringNameForServer, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", registerer), util_log.Logger)
   219  		if err != nil {
   220  			return nil, errors.Wrap(err, "create ring client")
   221  		}
   222  
   223  		svcs = append(svcs, s.ringLifecycler, s.ring)
   224  	} else {
   225  		// Always run if no scheduler ring is being used.
   226  		s.shouldRun.Store(true)
   227  	}
   228  
   229  	var err error
   230  	s.subservices, err = services.NewManager(svcs...)
   231  	if err != nil {
   232  		return nil, err
   233  	}
   234  	s.subservicesWatcher = services.NewFailureWatcher()
   235  	s.subservicesWatcher.WatchManager(s.subservices)
   236  
   237  	s.Service = services.NewBasicService(s.starting, s.running, s.stopping)
   238  	return s, nil
   239  }
   240  
   241  // Limits needed for the Query Scheduler - interface used for decoupling.
   242  type Limits interface {
   243  	// MaxQueriersPerUser returns max queriers to use per tenant, or 0 if shuffle sharding is disabled.
   244  	MaxQueriersPerUser(user string) int
   245  }
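         
         // For illustration only: a minimal static Limits implementation, assuming no
         // per-tenant overrides (the real implementation is normally backed by the
         // tenant limits/overrides machinery):
         //
         //	type staticLimits struct{ maxQueriers int }
         //
         //	func (l staticLimits) MaxQueriersPerUser(_ string) int { return l.maxQueriers }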
   246  
   247  type schedulerRequest struct {
   248  	frontendAddress string
   249  	userID          string
   250  	queryID         uint64
   251  	request         *httpgrpc.HTTPRequest
   252  	statsEnabled    bool
   253  
   254  	queueTime time.Time
   255  
   256  	ctx       context.Context
   257  	ctxCancel context.CancelFunc
   258  	queueSpan opentracing.Span
   259  
   260  	// This is only used for testing.
   261  	parentSpanContext opentracing.SpanContext
   262  }
   263  
    264  // FrontendLoop handles a connection from a frontend.
   265  func (s *Scheduler) FrontendLoop(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) error {
   266  	frontendAddress, frontendCtx, err := s.frontendConnected(frontend)
   267  	if err != nil {
   268  		return err
   269  	}
   270  	defer s.frontendDisconnected(frontendAddress)
   271  
    272  	// Respond to INIT. If the scheduler is not running, we skip the for loop, send SHUTTING_DOWN and exit this method.
   273  	if s.State() == services.Running && s.shouldRun.Load() {
   274  		if err := frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}); err != nil {
   275  			return err
   276  		}
   277  	}
   278  
    279  	// We stop accepting new queries in Stopping state. By returning quickly, we disconnect frontends, which in turn
    280  	// cancels all their queries.
   281  	for s.State() == services.Running {
   282  		msg, err := frontend.Recv()
   283  		if err != nil {
    284  			// No need to report this as an error, it is expected when the query-frontend performs SendClose() (as frontendSchedulerWorker does).
   285  			if err == io.EOF {
   286  				return nil
   287  			}
   288  			return err
   289  		}
   290  
   291  		if s.State() != services.Running {
   292  			break // break out of the loop, and send SHUTTING_DOWN message.
   293  		}
   294  
   295  		var resp *schedulerpb.SchedulerToFrontend
   296  
   297  		switch msg.GetType() {
   298  		case schedulerpb.ENQUEUE:
   299  			err = s.enqueueRequest(frontendCtx, frontendAddress, msg)
   300  			switch {
   301  			case err == nil:
   302  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}
   303  			case err == queue.ErrTooManyRequests:
   304  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.TOO_MANY_REQUESTS_PER_TENANT}
   305  			default:
   306  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.ERROR, Error: err.Error()}
   307  			}
   308  
   309  		case schedulerpb.CANCEL:
   310  			s.cancelRequestAndRemoveFromPending(frontendAddress, msg.QueryID)
   311  			resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}
   312  
   313  		default:
   314  			level.Error(s.log).Log("msg", "unknown request type from frontend", "addr", frontendAddress, "type", msg.GetType())
   315  			return errors.New("unknown request type")
   316  		}
   317  
   318  		err = frontend.Send(resp)
   319  		// Failure to send response results in ending this connection.
   320  		if err != nil {
   321  			return err
   322  		}
   323  	}
   324  
    325  	// Report shutdown back to the frontend, so that it can retry with a different scheduler. Also stop the frontend loop.
   326  	return frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN})
   327  }
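         
         // A rough sketch of the exchange implemented by FrontendLoop above; the message
         // types are the schedulerpb values used in this file:
         //
         //	frontend                              scheduler
         //	   |---- INIT ----------------------->|  register connection
         //	   |<--- OK --------------------------|  only while running and in the ReplicationSet
         //	   |---- ENQUEUE / CANCEL ----------->|
         //	   |<--- OK / TOO_MANY_REQUESTS_PER_TENANT / ERROR
         //	   |<--- SHUTTING_DOWN ---------------|  on scheduler stop or when leaving the set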
   328  
   329  func (s *Scheduler) frontendConnected(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) (string, context.Context, error) {
   330  	msg, err := frontend.Recv()
   331  	if err != nil {
   332  		return "", nil, err
   333  	}
   334  	if msg.Type != schedulerpb.INIT || msg.FrontendAddress == "" {
   335  		return "", nil, errors.New("no frontend address")
   336  	}
   337  
   338  	level.Debug(s.log).Log("msg", "frontend connected", "address", msg.FrontendAddress)
   339  
   340  	s.connectedFrontendsMu.Lock()
   341  	defer s.connectedFrontendsMu.Unlock()
   342  
   343  	cf := s.connectedFrontends[msg.FrontendAddress]
   344  	if cf == nil {
   345  		cf = &connectedFrontend{
   346  			connections: 0,
   347  			frontend:    frontend,
   348  		}
   349  		cf.ctx, cf.cancel = context.WithCancel(context.Background())
   350  		s.connectedFrontends[msg.FrontendAddress] = cf
   351  	}
   352  
   353  	cf.connections++
   354  	return msg.FrontendAddress, cf.ctx, nil
   355  }
   356  
   357  func (s *Scheduler) frontendDisconnected(frontendAddress string) {
   358  	s.connectedFrontendsMu.Lock()
   359  	defer s.connectedFrontendsMu.Unlock()
   360  
   361  	level.Debug(s.log).Log("msg", "frontend disconnected", "address", frontendAddress)
   362  
   363  	cf := s.connectedFrontends[frontendAddress]
   364  	cf.connections--
   365  	if cf.connections == 0 {
   366  		delete(s.connectedFrontends, frontendAddress)
   367  		cf.cancel()
   368  	}
   369  }
   370  
   371  func (s *Scheduler) enqueueRequest(frontendContext context.Context, frontendAddr string, msg *schedulerpb.FrontendToScheduler) error {
   372  	// Create new context for this request, to support cancellation.
   373  	ctx, cancel := context.WithCancel(frontendContext)
   374  	shouldCancel := true
   375  	defer func() {
   376  		if shouldCancel {
   377  			cancel()
   378  		}
   379  	}()
   380  
    381  	// Extract tracing information from the headers in the HTTP request. The frontend context doesn't have the correct
    382  	// tracing information, since it belongs to the long-running frontend connection rather than to this query.
   383  	tracer := opentracing.GlobalTracer()
   384  	parentSpanContext, err := lokigrpc.GetParentSpanForRequest(tracer, msg.HttpRequest)
   385  	if err != nil {
   386  		return err
   387  	}
   388  
   389  	userID := msg.GetUserID()
   390  
   391  	req := &schedulerRequest{
   392  		frontendAddress: frontendAddr,
   393  		userID:          msg.UserID,
   394  		queryID:         msg.QueryID,
   395  		request:         msg.HttpRequest,
   396  		statsEnabled:    msg.StatsEnabled,
   397  	}
   398  
   399  	now := time.Now()
   400  
   401  	req.parentSpanContext = parentSpanContext
   402  	req.queueSpan, req.ctx = opentracing.StartSpanFromContextWithTracer(ctx, tracer, "queued", opentracing.ChildOf(parentSpanContext))
   403  	req.queueTime = now
   404  	req.ctxCancel = cancel
   405  
    406  	// Aggregate the max queriers limit in the case of a multi-tenant query.
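         	// (Illustrative example: for org ID "a|b" where tenant "a" allows 4 queriers
         	// and "b" allows 2, the smallest positive non-zero value, 2, is expected to be
         	// used; if every tenant's limit is 0, shuffle sharding stays disabled.)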
   407  	tenantIDs, err := tenant.TenantIDsFromOrgID(userID)
   408  	if err != nil {
   409  		return err
   410  	}
   411  	maxQueriers := validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, s.limits.MaxQueriersPerUser)
   412  
   413  	s.activeUsers.UpdateUserTimestamp(userID, now)
   414  	return s.requestQueue.EnqueueRequest(userID, req, maxQueriers, func() {
   415  		shouldCancel = false
   416  
   417  		s.pendingRequestsMu.Lock()
   418  		defer s.pendingRequestsMu.Unlock()
   419  		s.pendingRequests[requestKey{frontendAddr: frontendAddr, queryID: msg.QueryID}] = req
   420  	})
   421  }
   422  
    423  // This method doesn't remove the request from the queue.
   424  func (s *Scheduler) cancelRequestAndRemoveFromPending(frontendAddr string, queryID uint64) {
   425  	s.pendingRequestsMu.Lock()
   426  	defer s.pendingRequestsMu.Unlock()
   427  
   428  	key := requestKey{frontendAddr: frontendAddr, queryID: queryID}
   429  	req := s.pendingRequests[key]
   430  	if req != nil {
   431  		req.ctxCancel()
   432  	}
   433  	delete(s.pendingRequests, key)
   434  }
   435  
    436  // QuerierLoop is started by a querier to receive queries from the scheduler.
   437  func (s *Scheduler) QuerierLoop(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer) error {
   438  	resp, err := querier.Recv()
   439  	if err != nil {
   440  		return err
   441  	}
   442  
   443  	querierID := resp.GetQuerierID()
   444  	level.Debug(s.log).Log("msg", "querier connected", "querier", querierID)
   445  
   446  	s.requestQueue.RegisterQuerierConnection(querierID)
   447  	defer s.requestQueue.UnregisterQuerierConnection(querierID)
   448  
   449  	lastUserIndex := queue.FirstUser()
   450  
    451  	// In the Stopping state the scheduler no longer accepts new queries, but still dispatches the queries already in the queues.
   452  	for s.isRunningOrStopping() {
   453  		req, idx, err := s.requestQueue.GetNextRequestForQuerier(querier.Context(), lastUserIndex, querierID)
   454  		if err != nil {
   455  			return err
   456  		}
   457  		lastUserIndex = idx
   458  
   459  		r := req.(*schedulerRequest)
   460  
   461  		reqQueueTime := time.Since(r.queueTime)
   462  		s.queueDuration.Observe(reqQueueTime.Seconds())
   463  		r.queueSpan.Finish()
   464  
   465  		// Add HTTP header to the request containing the query queue time
   466  		r.request.Headers = append(r.request.Headers, &httpgrpc.Header{
   467  			Key:    textproto.CanonicalMIMEHeaderKey(string(lokihttpreq.QueryQueueTimeHTTPHeader)),
   468  			Values: []string{reqQueueTime.String()},
   469  		})
   470  
   471  		/*
   472  		  We want to dequeue the next unexpired request from the chosen tenant queue.
   473  		  The chance of choosing a particular tenant for dequeueing is (1/active_tenants).
   474  		  This is problematic under load, especially with other middleware enabled such as
   475  		  querier.split-by-interval, where one request may fan out into many.
   476  		  If expired requests aren't exhausted before checking another tenant, it would take
   477  		  n_active_tenants * n_expired_requests_at_front_of_queue requests being processed
   478  		  before an active request was handled for the tenant in question.
   479  		  If this tenant meanwhile continued to queue requests,
    480  		  it's possible that its own queue would perpetually contain only expired requests.
   481  		*/
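         		// Worked example of the math above (numbers purely illustrative): with 100
         		// active tenants and 50 expired requests at the front of this tenant's queue,
         		// roughly 100 * 50 = 5000 dequeue operations could pass before one of its live
         		// requests is handled, hence the ReuseLastUser call below.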
   482  
   483  		if r.ctx.Err() != nil {
   484  			// Remove from pending requests.
   485  			s.cancelRequestAndRemoveFromPending(r.frontendAddress, r.queryID)
   486  
   487  			lastUserIndex = lastUserIndex.ReuseLastUser()
   488  			continue
   489  		}
   490  
   491  		if err := s.forwardRequestToQuerier(querier, r); err != nil {
   492  			return err
   493  		}
   494  	}
   495  
   496  	return errSchedulerIsNotRunning
   497  }
   498  
   499  func (s *Scheduler) NotifyQuerierShutdown(_ context.Context, req *schedulerpb.NotifyQuerierShutdownRequest) (*schedulerpb.NotifyQuerierShutdownResponse, error) {
   500  	level.Debug(s.log).Log("msg", "received shutdown notification from querier", "querier", req.GetQuerierID())
   501  	s.requestQueue.NotifyQuerierShutdown(req.GetQuerierID())
   502  
   503  	return &schedulerpb.NotifyQuerierShutdownResponse{}, nil
   504  }
   505  
   506  func (s *Scheduler) forwardRequestToQuerier(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer, req *schedulerRequest) error {
    507  	// Make sure to cancel the request at the end to clean up resources.
   508  	defer s.cancelRequestAndRemoveFromPending(req.frontendAddress, req.queryID)
   509  
    510  	// Handle the stream sending & receiving on a goroutine so we can
    511  	// monitor the contexts in a select and cancel things appropriately.
   512  	errCh := make(chan error, 1)
   513  	go func() {
   514  		err := querier.Send(&schedulerpb.SchedulerToQuerier{
   515  			UserID:          req.userID,
   516  			QueryID:         req.queryID,
   517  			FrontendAddress: req.frontendAddress,
   518  			HttpRequest:     req.request,
   519  			StatsEnabled:    req.statsEnabled,
   520  		})
   521  		if err != nil {
   522  			errCh <- err
   523  			return
   524  		}
   525  
   526  		_, err = querier.Recv()
   527  		errCh <- err
   528  	}()
   529  
   530  	select {
   531  	case <-req.ctx.Done():
    532  		// If the upstream request is cancelled (eg. frontend issued CANCEL or closed connection),
    533  		// we need to cancel the downstream req. The only way we can do that is to close the stream (by returning an error here).
    534  		// The querier expects these semantics.
   535  		return req.ctx.Err()
   536  
   537  	case err := <-errCh:
    538  		// If there was an error handling this request due to network IO,
    539  		// then error out this upstream request _and_ the stream.
   540  
   541  		if err != nil {
   542  			s.forwardErrorToFrontend(req.ctx, req, err)
   543  		}
   544  		return err
   545  	}
   546  }
   547  
   548  func (s *Scheduler) forwardErrorToFrontend(ctx context.Context, req *schedulerRequest, requestErr error) {
   549  	opts, err := s.cfg.GRPCClientConfig.DialOption([]grpc.UnaryClientInterceptor{
   550  		otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
   551  		middleware.ClientUserHeaderInterceptor,
   552  	},
   553  		nil)
   554  	if err != nil {
   555  		level.Warn(s.log).Log("msg", "failed to create gRPC options for the connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   556  		return
   557  	}
   558  
   559  	conn, err := grpc.DialContext(ctx, req.frontendAddress, opts...)
   560  	if err != nil {
   561  		level.Warn(s.log).Log("msg", "failed to create gRPC connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   562  		return
   563  	}
   564  
   565  	defer func() {
   566  		_ = conn.Close()
   567  	}()
   568  
   569  	client := frontendv2pb.NewFrontendForQuerierClient(conn)
   570  
   571  	userCtx := user.InjectOrgID(ctx, req.userID)
   572  	_, err = client.QueryResult(userCtx, &frontendv2pb.QueryResultRequest{
   573  		QueryID: req.queryID,
   574  		HttpResponse: &httpgrpc.HTTPResponse{
   575  			Code: http.StatusInternalServerError,
   576  			Body: []byte(requestErr.Error()),
   577  		},
   578  	})
   579  
   580  	if err != nil {
   581  		level.Warn(s.log).Log("msg", "failed to forward error to frontend", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   582  		return
   583  	}
   584  }
   585  
   586  func (s *Scheduler) isRunningOrStopping() bool {
   587  	st := s.State()
   588  	return st == services.Running || st == services.Stopping
   589  }
   590  
   591  func (s *Scheduler) starting(ctx context.Context) (err error) {
    592  	// In case this function returns an error we want to unregister the instance
    593  	// from the ring. We do that while ensuring dependencies are gracefully stopped if they
    594  	// were already started.
   595  	defer func() {
   596  		if err == nil || s.subservices == nil {
   597  			return
   598  		}
   599  
   600  		if stopErr := services.StopManagerAndAwaitStopped(context.Background(), s.subservices); stopErr != nil {
   601  			level.Error(s.log).Log("msg", "failed to gracefully stop scheduler dependencies", "err", stopErr)
   602  		}
   603  	}()
   604  
   605  	if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil {
   606  		return errors.Wrap(err, "unable to start scheduler subservices")
   607  	}
   608  
   609  	if s.cfg.UseSchedulerRing {
   610  		// The BasicLifecycler does not automatically move state to ACTIVE such that any additional work that
   611  		// someone wants to do can be done before becoming ACTIVE. For the query scheduler we don't currently
   612  		// have any additional work so we can become ACTIVE right away.
   613  
    614  		// Wait until the ring client has detected this instance in the JOINING state to
    615  		// make sure that when we run the initial sync we already know the tokens
    616  		// assigned to this instance.
   617  		level.Info(s.log).Log("msg", "waiting until scheduler is JOINING in the ring")
   618  		if err := ring.WaitInstanceState(ctx, s.ring, s.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
   619  			return err
   620  		}
   621  		level.Info(s.log).Log("msg", "scheduler is JOINING in the ring")
   622  
   623  		// Change ring state to ACTIVE
   624  		if err = s.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
   625  			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
   626  		}
   627  
    628  		// Wait until the ring client has detected this instance in the ACTIVE state to
    629  		// make sure that when we run the loop it won't be detected as a ring
    630  		// topology change.
   631  		level.Info(s.log).Log("msg", "waiting until scheduler is ACTIVE in the ring")
   632  		if err := ring.WaitInstanceState(ctx, s.ring, s.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
   633  			return err
   634  		}
   635  		level.Info(s.log).Log("msg", "scheduler is ACTIVE in the ring")
   636  	}
   637  
   638  	return nil
   639  }
   640  
   641  func (s *Scheduler) running(ctx context.Context) error {
   642  	// We observe inflight requests frequently and at regular intervals, to have a good
   643  	// approximation of max inflight requests over percentiles of time. We also do it with
   644  	// a ticker so that we keep tracking it even if we have no new queries but stuck inflight
   645  	// requests (eg. queriers are all crashing).
   646  	inflightRequestsTicker := time.NewTicker(250 * time.Millisecond)
   647  	defer inflightRequestsTicker.Stop()
   648  
   649  	ringCheckTicker := time.NewTicker(ringCheckPeriod)
   650  	defer ringCheckTicker.Stop()
   651  
   652  	for {
   653  		select {
   654  		case <-ctx.Done():
   655  			return nil
   656  		case err := <-s.subservicesWatcher.Chan():
   657  			return errors.Wrap(err, "scheduler subservice failed")
   658  		case <-ringCheckTicker.C:
   659  			if !s.cfg.UseSchedulerRing {
   660  				continue
   661  			}
   662  			isInSet, err := lokiutil.IsInReplicationSet(s.ring, lokiutil.RingKeyOfLeader, s.ringLifecycler.GetInstanceAddr())
   663  			if err != nil {
    664  				level.Error(s.log).Log("msg", "failed to query the ring to see if scheduler instance is in ReplicationSet, will try again", "err", err)
   665  				continue
   666  			}
   667  			s.setRunState(isInSet)
   668  		case <-inflightRequestsTicker.C:
   669  			s.pendingRequestsMu.Lock()
   670  			inflight := len(s.pendingRequests)
   671  			s.pendingRequestsMu.Unlock()
   672  
   673  			s.inflightRequests.Observe(float64(inflight))
   674  		}
   675  	}
   676  }
   677  
   678  func (s *Scheduler) setRunState(isInSet bool) {
   679  	if isInSet {
   680  		if s.shouldRun.CAS(false, true) {
   681  			// Value was swapped, meaning this was a state change from stopped to running.
   682  			level.Info(s.log).Log("msg", "this scheduler is in the ReplicationSet, will now accept requests.")
   683  			s.schedulerRunning.Set(1)
   684  		}
   685  	} else {
   686  		if s.shouldRun.CAS(true, false) {
    687  			// Value was swapped, meaning this was a state change from running to stopped;
    688  			// we need to send a shutdown message to all the connected frontends.
   689  			level.Info(s.log).Log("msg", "this scheduler is no longer in the ReplicationSet, disconnecting frontends, canceling queries and no longer accepting requests.")
   690  
    691  			// Send a shutdown message to the connected frontends: there is no way to break the blocking Recv() in FrontendLoop(),
    692  			// so we send a message telling the frontends we are shutting down so that they will disconnect.
    693  			// When FrontendLoop() exits for a connected frontend, all of its inflight and queued queries will be canceled.
   694  			s.connectedFrontendsMu.Lock()
   695  			defer s.connectedFrontendsMu.Unlock()
   696  			for _, f := range s.connectedFrontends {
   697  				// We ignore any errors here because there isn't really an action to take and because
   698  				// the frontends are also discovering the ring changes and may already be disconnected
   699  				// or have disconnected.
   700  				_ = f.frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN})
   701  			}
   702  			s.schedulerRunning.Set(0)
   703  		}
   704  	}
   705  }
   706  
   707  // Close the Scheduler.
   708  func (s *Scheduler) stopping(_ error) error {
    709  	// This will also stop the request queue, which stops accepting new requests and errors out any pending requests.
   710  	return services.StopManagerAndAwaitStopped(context.Background(), s.subservices)
   711  }
   712  
   713  func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
   714  	s.queueLength.DeleteLabelValues(user)
   715  	s.discardedRequests.DeleteLabelValues(user)
   716  }
   717  
   718  func (s *Scheduler) getConnectedFrontendClientsMetric() float64 {
   719  	s.connectedFrontendsMu.Lock()
   720  	defer s.connectedFrontendsMu.Unlock()
   721  
   722  	count := 0
   723  	for _, workers := range s.connectedFrontends {
   724  		count += workers.connections
   725  	}
   726  
   727  	return float64(count)
   728  }
   729  
    730  // SafeReadRing does a nil check on the Scheduler before attempting to return its ring.
    731  // This is necessary as many callers of this function will only have a valid Scheduler
    732  // reference if the QueryScheduler target has been specified, which is not guaranteed.
   733  func SafeReadRing(s *Scheduler) ring.ReadRing {
   734  	if s == nil || s.ring == nil || !s.cfg.UseSchedulerRing {
   735  		return nil
   736  	}
   737  
   738  	return s.ring
   739  }
   740  
   741  func (s *Scheduler) OnRingInstanceRegister(_ *ring.BasicLifecycler, ringDesc ring.Desc, instanceExists bool, instanceID string, instanceDesc ring.InstanceDesc) (ring.InstanceState, ring.Tokens) {
    742  	// When we initialize the scheduler instance in the ring we want to start from
    743  	// a clean situation, so whatever the previous state was we set it to JOINING, while we keep existing
    744  	// tokens (if any) or the ones loaded from file.
   745  	var tokens []uint32
   746  	if instanceExists {
   747  		tokens = instanceDesc.GetTokens()
   748  	}
   749  
   750  	takenTokens := ringDesc.GetTokens()
   751  	newTokens := ring.GenerateTokens(ringNumTokens-len(tokens), takenTokens)
   752  
   753  	// Tokens sorting will be enforced by the parent caller.
   754  	tokens = append(tokens, newTokens...)
   755  
   756  	return ring.JOINING, tokens
   757  }
   758  
   759  func (s *Scheduler) OnRingInstanceTokens(_ *ring.BasicLifecycler, _ ring.Tokens) {}
   760  func (s *Scheduler) OnRingInstanceStopping(_ *ring.BasicLifecycler)              {}
   761  func (s *Scheduler) OnRingInstanceHeartbeat(_ *ring.BasicLifecycler, _ *ring.Desc, _ *ring.InstanceDesc) {
   762  }
   763  
   764  func (s *Scheduler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
   765  	s.ring.ServeHTTP(w, req)
   766  }