github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/scheduler/scheduler.go

     1  package scheduler
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"io"
     7  	"net/http"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/go-kit/log"
    12  	"github.com/go-kit/log/level"
    13  	"github.com/grafana/dskit/grpcclient"
    14  	"github.com/grafana/dskit/services"
    15  	otgrpc "github.com/opentracing-contrib/go-grpc"
    16  	"github.com/opentracing/opentracing-go"
    17  	"github.com/pkg/errors"
    18  	"github.com/prometheus/client_golang/prometheus"
    19  	"github.com/prometheus/client_golang/prometheus/promauto"
    20  	"github.com/weaveworks/common/httpgrpc"
    21  	"github.com/weaveworks/common/middleware"
    22  	"github.com/weaveworks/common/user"
    23  	"google.golang.org/grpc"
    24  
    25  	"github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb"
    26  	"github.com/cortexproject/cortex/pkg/scheduler/queue"
    27  	"github.com/cortexproject/cortex/pkg/scheduler/schedulerpb"
    28  	"github.com/cortexproject/cortex/pkg/tenant"
    29  	"github.com/cortexproject/cortex/pkg/util"
    30  	"github.com/cortexproject/cortex/pkg/util/httpgrpcutil"
    31  	"github.com/cortexproject/cortex/pkg/util/validation"
    32  )
    33  
    34  var (
    35  	errSchedulerIsNotRunning = errors.New("scheduler is not running")
    36  )
    37  
    38  // Scheduler is responsible for queueing and dispatching queries to Queriers.
    39  type Scheduler struct {
    40  	services.Service
    41  
    42  	cfg Config
    43  	log log.Logger
    44  
    45  	limits Limits
    46  
    47  	connectedFrontendsMu sync.Mutex
    48  	connectedFrontends   map[string]*connectedFrontend
    49  
    50  	requestQueue *queue.RequestQueue
    51  	activeUsers  *util.ActiveUsersCleanupService
    52  
    53  	pendingRequestsMu sync.Mutex
    54  	pendingRequests   map[requestKey]*schedulerRequest // Request is kept in this map even after being dispatched to querier. It can still be canceled at that time.
    55  
    56  	// Subservices manager.
    57  	subservices        *services.Manager
    58  	subservicesWatcher *services.FailureWatcher
    59  
    60  	// Metrics.
    61  	queueLength              *prometheus.GaugeVec
    62  	discardedRequests        *prometheus.CounterVec
    63  	connectedQuerierClients  prometheus.GaugeFunc
    64  	connectedFrontendClients prometheus.GaugeFunc
    65  	queueDuration            prometheus.Histogram
    66  }
    67  
    68  type requestKey struct {
    69  	frontendAddr string
    70  	queryID      uint64
    71  }
    72  
    73  type connectedFrontend struct {
    74  	connections int
    75  
    76  	// This context is used for running all queries from the same frontend.
    77  	// When the last frontend connection is closed, the context is canceled.
    78  	ctx    context.Context
    79  	cancel context.CancelFunc
    80  }
    81  
    82  type Config struct {
    83  	MaxOutstandingPerTenant int               `yaml:"max_outstanding_requests_per_tenant"`
    84  	QuerierForgetDelay      time.Duration     `yaml:"querier_forget_delay"`
    85  	GRPCClientConfig        grpcclient.Config `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."`
    86  }
    87  
    88  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    89  	f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query-scheduler. In-flight requests above this limit will fail with HTTP response status code 429.")
    90  	f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.")
    91  	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f)
    92  }
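
// For illustration only: a minimal sketch (not part of the scheduler API) of how a caller
// could register and parse these flags. The function name and the example argument values
// shown below are hypothetical.
func exampleRegisterSchedulerFlags(args []string) (Config, error) {
	var cfg Config
	fs := flag.NewFlagSet("query-scheduler", flag.ContinueOnError)
	cfg.RegisterFlags(fs)

	// e.g. args = []string{
	//     "-query-scheduler.max-outstanding-requests-per-tenant=200",
	//     "-query-scheduler.querier-forget-delay=10s",
	// }
	err := fs.Parse(args)
	return cfg, err
}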
    93  
    94  // NewScheduler creates a new Scheduler.
    95  func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer prometheus.Registerer) (*Scheduler, error) {
    96  	s := &Scheduler{
    97  		cfg:    cfg,
    98  		log:    log,
    99  		limits: limits,
   100  
   101  		pendingRequests:    map[requestKey]*schedulerRequest{},
   102  		connectedFrontends: map[string]*connectedFrontend{},
   103  	}
   104  
   105  	s.queueLength = promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
   106  		Name: "cortex_query_scheduler_queue_length",
   107  		Help: "Number of queries in the queue.",
   108  	}, []string{"user"})
   109  
   110  	s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
   111  		Name: "cortex_query_scheduler_discarded_requests_total",
   112  		Help: "Total number of query requests discarded.",
   113  	}, []string{"user"})
   114  	s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests)
   115  
   116  	s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
   117  		Name:    "cortex_query_scheduler_queue_duration_seconds",
   118  		Help:    "Time spent by requests in the queue before getting picked up by a querier.",
   119  		Buckets: prometheus.DefBuckets,
   120  	})
   121  	s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
   122  		Name: "cortex_query_scheduler_connected_querier_clients",
   123  		Help: "Number of querier worker clients currently connected to the query-scheduler.",
   124  	}, s.requestQueue.GetConnectedQuerierWorkersMetric)
   125  	s.connectedFrontendClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
   126  		Name: "cortex_query_scheduler_connected_frontend_clients",
   127  		Help: "Number of query-frontend worker clients currently connected to the query-scheduler.",
   128  	}, s.getConnectedFrontendClientsMetric)
   129  
   130  	s.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(s.cleanupMetricsForInactiveUser)
   131  	s.subservicesWatcher = services.NewFailureWatcher()
   132  	var err error
   133  	s.subservices, err = services.NewManager(s.requestQueue, s.activeUsers)
   134  	if err != nil {
   135  		return nil, err
   136  	}
   137  
   138  	s.Service = services.NewBasicService(s.starting, s.running, s.stopping)
   139  	return s, nil
   140  }
   141  
   142  // Limits defines the per-tenant limits needed by the query-scheduler; an interface is used for decoupling.
   143  type Limits interface {
   144  	// MaxQueriersPerUser returns max queriers to use per tenant, or 0 if shuffle sharding is disabled.
   145  	MaxQueriersPerUser(user string) int
   146  }
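
// For illustration only: a hypothetical static Limits implementation (e.g. for tests) and a
// sketch of wiring it into NewScheduler. The names staticLimits and exampleNewScheduler are
// not part of the Cortex codebase.
type staticLimits struct {
	maxQueriersPerUser int
}

// MaxQueriersPerUser returns the same limit for every tenant; 0 disables shuffle sharding.
func (l staticLimits) MaxQueriersPerUser(_ string) int {
	return l.maxQueriersPerUser
}

func exampleNewScheduler() (*Scheduler, error) {
	cfg := Config{MaxOutstandingPerTenant: 100}
	// A real caller would pass its configured logger and Prometheus registerer.
	return NewScheduler(cfg, staticLimits{maxQueriersPerUser: 0}, log.NewNopLogger(), prometheus.NewRegistry())
}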
   147  
   148  type schedulerRequest struct {
   149  	frontendAddress string
   150  	userID          string
   151  	queryID         uint64
   152  	request         *httpgrpc.HTTPRequest
   153  	statsEnabled    bool
   154  
   155  	enqueueTime time.Time
   156  
   157  	ctx       context.Context
   158  	ctxCancel context.CancelFunc
   159  	queueSpan opentracing.Span
   160  
   161  	// This is only used for testing.
   162  	parentSpanContext opentracing.SpanContext
   163  }
   164  
   165  // FrontendLoop handles connection from frontend.
   166  func (s *Scheduler) FrontendLoop(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) error {
   167  	frontendAddress, frontendCtx, err := s.frontendConnected(frontend)
   168  	if err != nil {
   169  		return err
   170  	}
   171  	defer s.frontendDisconnected(frontendAddress)
   172  
   173  	// Respond to INIT. If the scheduler is not running, we skip the for loop, send SHUTTING_DOWN and exit this method.
   174  	if s.State() == services.Running {
   175  		if err := frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}); err != nil {
   176  			return err
   177  		}
   178  	}
   179  
   180  	// We stop accepting new queries in the Stopping state. By returning quickly, we disconnect frontends, which in turn
   181  	// cancels all their queries.
   182  	for s.State() == services.Running {
   183  		msg, err := frontend.Recv()
   184  		if err != nil {
   185  			// No need to report this as an error, it is expected when the query-frontend performs SendClose() (as frontendSchedulerWorker does).
   186  			if err == io.EOF {
   187  				return nil
   188  			}
   189  			return err
   190  		}
   191  
   192  		if s.State() != services.Running {
   193  			break // break out of the loop, and send SHUTTING_DOWN message.
   194  		}
   195  
   196  		var resp *schedulerpb.SchedulerToFrontend
   197  
   198  		switch msg.GetType() {
   199  		case schedulerpb.ENQUEUE:
   200  			err = s.enqueueRequest(frontendCtx, frontendAddress, msg)
   201  			switch {
   202  			case err == nil:
   203  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}
   204  			case err == queue.ErrTooManyRequests:
   205  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.TOO_MANY_REQUESTS_PER_TENANT}
   206  			default:
   207  				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.ERROR, Error: err.Error()}
   208  			}
   209  
   210  		case schedulerpb.CANCEL:
   211  			s.cancelRequestAndRemoveFromPending(frontendAddress, msg.QueryID)
   212  			resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}
   213  
   214  		default:
   215  			level.Error(s.log).Log("msg", "unknown request type from frontend", "addr", frontendAddress, "type", msg.GetType())
   216  			return errors.New("unknown request type")
   217  		}
   218  
   219  		err = frontend.Send(resp)
   220  		// Failure to send response results in ending this connection.
   221  		if err != nil {
   222  			return err
   223  		}
   224  	}
   225  
   226  	// Report shutdown back to the frontend, so that it can retry with a different scheduler. Also stop the frontend loop.
   227  	return frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN})
   228  }
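
// For illustration only: the message sequence a query-frontend is expected to send on the
// FrontendLoop stream, inferred from the handling above. The address, query ID and tenant ID
// below are hypothetical, as is the function name.
func exampleFrontendMessages() []*schedulerpb.FrontendToScheduler {
	return []*schedulerpb.FrontendToScheduler{
		// The first message must be INIT and carry the frontend's own address, so that queriers
		// (and the scheduler, on errors) can reach the frontend back.
		{Type: schedulerpb.INIT, FrontendAddress: "query-frontend-0:9095"},

		// ENQUEUE adds a query to the tenant's queue; the scheduler answers OK,
		// TOO_MANY_REQUESTS_PER_TENANT or ERROR.
		{
			Type:        schedulerpb.ENQUEUE,
			QueryID:     1,
			UserID:      "tenant-1",
			HttpRequest: &httpgrpc.HTTPRequest{Method: "GET", Url: "/api/v1/query?query=up"},
		},

		// CANCEL aborts a previously enqueued (or already dispatched) query.
		{Type: schedulerpb.CANCEL, QueryID: 1},
	}
}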
   229  
   230  func (s *Scheduler) frontendConnected(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) (string, context.Context, error) {
   231  	msg, err := frontend.Recv()
   232  	if err != nil {
   233  		return "", nil, err
   234  	}
   235  	if msg.Type != schedulerpb.INIT || msg.FrontendAddress == "" {
   236  		return "", nil, errors.New("no frontend address")
   237  	}
   238  
   239  	s.connectedFrontendsMu.Lock()
   240  	defer s.connectedFrontendsMu.Unlock()
   241  
   242  	cf := s.connectedFrontends[msg.FrontendAddress]
   243  	if cf == nil {
   244  		cf = &connectedFrontend{
   245  			connections: 0,
   246  		}
   247  		cf.ctx, cf.cancel = context.WithCancel(context.Background())
   248  		s.connectedFrontends[msg.FrontendAddress] = cf
   249  	}
   250  
   251  	cf.connections++
   252  	return msg.FrontendAddress, cf.ctx, nil
   253  }
   254  
   255  func (s *Scheduler) frontendDisconnected(frontendAddress string) {
   256  	s.connectedFrontendsMu.Lock()
   257  	defer s.connectedFrontendsMu.Unlock()
   258  
   259  	cf := s.connectedFrontends[frontendAddress]
   260  	cf.connections--
   261  	if cf.connections == 0 {
   262  		delete(s.connectedFrontends, frontendAddress)
   263  		cf.cancel()
   264  	}
   265  }
   266  
   267  func (s *Scheduler) enqueueRequest(frontendContext context.Context, frontendAddr string, msg *schedulerpb.FrontendToScheduler) error {
   268  	// Create new context for this request, to support cancellation.
   269  	ctx, cancel := context.WithCancel(frontendContext)
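	// If enqueueing succeeds, the callback passed to EnqueueRequest below flips shouldCancel
	// to false: from that point on the request (and its cancel function) is owned by the
	// pendingRequests map and must stay alive until it is dispatched or explicitly canceled.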
   270  	shouldCancel := true
   271  	defer func() {
   272  		if shouldCancel {
   273  			cancel()
   274  		}
   275  	}()
   276  
   277  	// Extract tracing information from the headers of the HTTP request. The frontend context doesn't have the correct
   278  	// tracing information, since it belongs to the long-running frontend connection rather than to this query.
   279  	tracer := opentracing.GlobalTracer()
   280  	parentSpanContext, err := httpgrpcutil.GetParentSpanForRequest(tracer, msg.HttpRequest)
   281  	if err != nil {
   282  		return err
   283  	}
   284  
   285  	userID := msg.GetUserID()
   286  
   287  	req := &schedulerRequest{
   288  		frontendAddress: frontendAddr,
   289  		userID:          msg.UserID,
   290  		queryID:         msg.QueryID,
   291  		request:         msg.HttpRequest,
   292  		statsEnabled:    msg.StatsEnabled,
   293  	}
   294  
   295  	now := time.Now()
   296  
   297  	req.parentSpanContext = parentSpanContext
   298  	req.queueSpan, req.ctx = opentracing.StartSpanFromContextWithTracer(ctx, tracer, "queued", opentracing.ChildOf(parentSpanContext))
   299  	req.enqueueTime = now
   300  	req.ctxCancel = cancel
   301  
   302  	// Aggregate the max queriers limit in the case of a multi-tenant query.
   303  	tenantIDs, err := tenant.TenantIDsFromOrgID(userID)
   304  	if err != nil {
   305  		return err
   306  	}
   307  	maxQueriers := validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, s.limits.MaxQueriersPerUser)
   308  
   309  	s.activeUsers.UpdateUserTimestamp(userID, now)
   310  	return s.requestQueue.EnqueueRequest(userID, req, maxQueriers, func() {
   311  		shouldCancel = false
   312  
   313  		s.pendingRequestsMu.Lock()
   314  		defer s.pendingRequestsMu.Unlock()
   315  		s.pendingRequests[requestKey{frontendAddr: frontendAddr, queryID: msg.QueryID}] = req
   316  	})
   317  }
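
// For illustration only: how the per-tenant max-queriers limit is resolved for a federated
// (multi-tenant) query. The org ID "tenant-a|tenant-b" and the function name are hypothetical.
func exampleAggregateMaxQueriers(limits Limits) (int, error) {
	tenantIDs, err := tenant.TenantIDsFromOrgID("tenant-a|tenant-b")
	if err != nil {
		return 0, err
	}
	// The smallest positive non-zero limit across the tenants wins; 0 means shuffle sharding
	// is disabled and any querier may pick up the request.
	return validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, limits.MaxQueriersPerUser), nil
}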
   318  
   319  // This method doesn't remove the request from the queue.
   320  func (s *Scheduler) cancelRequestAndRemoveFromPending(frontendAddr string, queryID uint64) {
   321  	s.pendingRequestsMu.Lock()
   322  	defer s.pendingRequestsMu.Unlock()
   323  
   324  	key := requestKey{frontendAddr: frontendAddr, queryID: queryID}
   325  	req := s.pendingRequests[key]
   326  	if req != nil {
   327  		req.ctxCancel()
   328  	}
   329  	delete(s.pendingRequests, key)
   330  }
   331  
   332  // QuerierLoop is started by a querier to receive queries from the scheduler.
   333  func (s *Scheduler) QuerierLoop(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer) error {
   334  	resp, err := querier.Recv()
   335  	if err != nil {
   336  		return err
   337  	}
   338  
   339  	querierID := resp.GetQuerierID()
   340  
   341  	s.requestQueue.RegisterQuerierConnection(querierID)
   342  	defer s.requestQueue.UnregisterQuerierConnection(querierID)
   343  
   344  	// If the downstream connection to the querier is cancelled,
   345  	// we need to ping the condition variable to unblock getNextRequestForQuerier.
   346  	// Ideally we'd have ctx-aware condition variables...
   347  	go func() {
   348  		<-querier.Context().Done()
   349  		s.requestQueue.QuerierDisconnecting()
   350  	}()
   351  
   352  	lastUserIndex := queue.FirstUser()
   353  
   354  	// In the Stopping state the scheduler is not accepting new queries, but it still dispatches queries already in the queues.
   355  	for s.isRunningOrStopping() {
   356  		req, idx, err := s.requestQueue.GetNextRequestForQuerier(querier.Context(), lastUserIndex, querierID)
   357  		if err != nil {
   358  			return err
   359  		}
   360  		lastUserIndex = idx
   361  
   362  		r := req.(*schedulerRequest)
   363  
   364  		s.queueDuration.Observe(time.Since(r.enqueueTime).Seconds())
   365  		r.queueSpan.Finish()
   366  
   367  		/*
   368  		  We want to dequeue the next unexpired request from the chosen tenant queue.
   369  		  The chance of choosing a particular tenant for dequeueing is (1/active_tenants).
   370  		  This is problematic under load, especially with other middleware enabled such as
   371  		  querier.split-by-interval, where one request may fan out into many.
   372  		  If expired requests aren't exhausted before checking another tenant, it would take
   373  		  n_active_tenants * n_expired_requests_at_front_of_queue requests being processed
   374  		  before an active request was handled for the tenant in question.
   375  		  If this tenant meanwhile continued to queue requests,
   376  		  it's possible that its own queue would perpetually contain only expired requests.
   377  		*/
   378  
   379  		if r.ctx.Err() != nil {
   380  			// Remove from pending requests.
   381  			s.cancelRequestAndRemoveFromPending(r.frontendAddress, r.queryID)
   382  
   383  			lastUserIndex = lastUserIndex.ReuseLastUser()
   384  			continue
   385  		}
   386  
   387  		if err := s.forwardRequestToQuerier(querier, r); err != nil {
   388  			return err
   389  		}
   390  	}
   391  
   392  	return errSchedulerIsNotRunning
   393  }
   394  
   395  func (s *Scheduler) NotifyQuerierShutdown(_ context.Context, req *schedulerpb.NotifyQuerierShutdownRequest) (*schedulerpb.NotifyQuerierShutdownResponse, error) {
   396  	level.Info(s.log).Log("msg", "received shutdown notification from querier", "querier", req.GetQuerierID())
   397  	s.requestQueue.NotifyQuerierShutdown(req.GetQuerierID())
   398  
   399  	return &schedulerpb.NotifyQuerierShutdownResponse{}, nil
   400  }
   401  
   402  func (s *Scheduler) forwardRequestToQuerier(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer, req *schedulerRequest) error {
   403  	// Make sure to cancel the request at the end to clean up resources.
   404  	defer s.cancelRequestAndRemoveFromPending(req.frontendAddress, req.queryID)
   405  
   406  	// Handle the stream sending & receiving on a goroutine so we can
   407  	// monitor the contexts in a select and cancel things appropriately.
   408  	errCh := make(chan error, 1)
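	// errCh is buffered so the goroutine below can always deliver its result and exit,
	// even if we return early on req.ctx cancellation and never read from the channel.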
   409  	go func() {
   410  		err := querier.Send(&schedulerpb.SchedulerToQuerier{
   411  			UserID:          req.userID,
   412  			QueryID:         req.queryID,
   413  			FrontendAddress: req.frontendAddress,
   414  			HttpRequest:     req.request,
   415  			StatsEnabled:    req.statsEnabled,
   416  		})
   417  		if err != nil {
   418  			errCh <- err
   419  			return
   420  		}
   421  
   422  		_, err = querier.Recv()
   423  		errCh <- err
   424  	}()
   425  
   426  	select {
   427  	case <-req.ctx.Done():
   428  		// If the upstream request is cancelled (e.g. the frontend issued CANCEL or closed the connection),
   429  		// we need to cancel the downstream request. The only way we can do that is to close the stream (by returning an error here).
   430  		// The querier expects these semantics.
   431  		return req.ctx.Err()
   432  
   433  	case err := <-errCh:
   434  		// If there was an error handling this request due to network IO,
   435  		// then error out this upstream request _and_ the stream.
   436  
   437  		if err != nil {
   438  			s.forwardErrorToFrontend(req.ctx, req, err)
   439  		}
   440  		return err
   441  	}
   442  }
   443  
   444  func (s *Scheduler) forwardErrorToFrontend(ctx context.Context, req *schedulerRequest, requestErr error) {
   445  	opts, err := s.cfg.GRPCClientConfig.DialOption([]grpc.UnaryClientInterceptor{
   446  		otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
   447  		middleware.ClientUserHeaderInterceptor},
   448  		nil)
   449  	if err != nil {
   450  		level.Warn(s.log).Log("msg", "failed to create gRPC options for the connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   451  		return
   452  	}
   453  
   454  	conn, err := grpc.DialContext(ctx, req.frontendAddress, opts...)
   455  	if err != nil {
   456  		level.Warn(s.log).Log("msg", "failed to create gRPC connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   457  		return
   458  	}
   459  
   460  	defer func() {
   461  		_ = conn.Close()
   462  	}()
   463  
   464  	client := frontendv2pb.NewFrontendForQuerierClient(conn)
   465  
   466  	userCtx := user.InjectOrgID(ctx, req.userID)
   467  	_, err = client.QueryResult(userCtx, &frontendv2pb.QueryResultRequest{
   468  		QueryID: req.queryID,
   469  		HttpResponse: &httpgrpc.HTTPResponse{
   470  			Code: http.StatusInternalServerError,
   471  			Body: []byte(requestErr.Error()),
   472  		},
   473  	})
   474  
   475  	if err != nil {
   476  		level.Warn(s.log).Log("msg", "failed to forward error to frontend", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
   477  		return
   478  	}
   479  }
   480  
   481  func (s *Scheduler) isRunningOrStopping() bool {
   482  	st := s.State()
   483  	return st == services.Running || st == services.Stopping
   484  }
   485  
   486  func (s *Scheduler) starting(ctx context.Context) error {
   487  	s.subservicesWatcher.WatchManager(s.subservices)
   488  
   489  	if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil {
   490  		return errors.Wrap(err, "unable to start scheduler subservices")
   491  	}
   492  
   493  	return nil
   494  }
   495  
   496  func (s *Scheduler) running(ctx context.Context) error {
   497  	for {
   498  		select {
   499  		case <-ctx.Done():
   500  			return nil
   501  		case err := <-s.subservicesWatcher.Chan():
   502  			return errors.Wrap(err, "scheduler subservice failed")
   503  		}
   504  	}
   505  }
   506  
   507  // Close the Scheduler.
   508  func (s *Scheduler) stopping(_ error) error {
   509  	// This will also stop the request queue, which stops accepting new requests and errors out any pending requests.
   510  	return services.StopManagerAndAwaitStopped(context.Background(), s.subservices)
   511  }
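
// For illustration only: the dskit service lifecycle a caller (e.g. the Cortex module system)
// drives around the scheduler. exampleRunScheduler is a hypothetical helper, not Cortex code.
func exampleRunScheduler(ctx context.Context, s *Scheduler) error {
	if err := services.StartAndAwaitRunning(ctx, s); err != nil {
		return err
	}
	// ... register the SchedulerForFrontend/SchedulerForQuerier services on a gRPC server and serve traffic ...
	return services.StopAndAwaitTerminated(context.Background(), s)
}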
   512  
   513  func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
   514  	s.queueLength.DeleteLabelValues(user)
   515  	s.discardedRequests.DeleteLabelValues(user)
   516  }
   517  
   518  func (s *Scheduler) getConnectedFrontendClientsMetric() float64 {
   519  	s.connectedFrontendsMu.Lock()
   520  	defer s.connectedFrontendsMu.Unlock()
   521  
   522  	count := 0
   523  	for _, workers := range s.connectedFrontends {
   524  		count += workers.connections
   525  	}
   526  
   527  	return float64(count)
   528  }