github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/scheduler/scheduler.go (about) 1 package scheduler 2 3 import ( 4 "context" 5 "flag" 6 "io" 7 "net/http" 8 "sync" 9 "time" 10 11 "github.com/go-kit/log" 12 "github.com/go-kit/log/level" 13 "github.com/grafana/dskit/grpcclient" 14 "github.com/grafana/dskit/services" 15 otgrpc "github.com/opentracing-contrib/go-grpc" 16 "github.com/opentracing/opentracing-go" 17 "github.com/pkg/errors" 18 "github.com/prometheus/client_golang/prometheus" 19 "github.com/prometheus/client_golang/prometheus/promauto" 20 "github.com/weaveworks/common/httpgrpc" 21 "github.com/weaveworks/common/middleware" 22 "github.com/weaveworks/common/user" 23 "google.golang.org/grpc" 24 25 "github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb" 26 "github.com/cortexproject/cortex/pkg/scheduler/queue" 27 "github.com/cortexproject/cortex/pkg/scheduler/schedulerpb" 28 "github.com/cortexproject/cortex/pkg/tenant" 29 "github.com/cortexproject/cortex/pkg/util" 30 "github.com/cortexproject/cortex/pkg/util/httpgrpcutil" 31 "github.com/cortexproject/cortex/pkg/util/validation" 32 ) 33 34 var ( 35 errSchedulerIsNotRunning = errors.New("scheduler is not running") 36 ) 37 38 // Scheduler is responsible for queueing and dispatching queries to Queriers. 39 type Scheduler struct { 40 services.Service 41 42 cfg Config 43 log log.Logger 44 45 limits Limits 46 47 connectedFrontendsMu sync.Mutex 48 connectedFrontends map[string]*connectedFrontend 49 50 requestQueue *queue.RequestQueue 51 activeUsers *util.ActiveUsersCleanupService 52 53 pendingRequestsMu sync.Mutex 54 pendingRequests map[requestKey]*schedulerRequest // Request is kept in this map even after being dispatched to querier. It can still be canceled at that time. 55 56 // Subservices manager. 57 subservices *services.Manager 58 subservicesWatcher *services.FailureWatcher 59 60 // Metrics. 61 queueLength *prometheus.GaugeVec 62 discardedRequests *prometheus.CounterVec 63 connectedQuerierClients prometheus.GaugeFunc 64 connectedFrontendClients prometheus.GaugeFunc 65 queueDuration prometheus.Histogram 66 } 67 68 type requestKey struct { 69 frontendAddr string 70 queryID uint64 71 } 72 73 type connectedFrontend struct { 74 connections int 75 76 // This context is used for running all queries from the same frontend. 77 // When last frontend connection is closed, context is canceled. 78 ctx context.Context 79 cancel context.CancelFunc 80 } 81 82 type Config struct { 83 MaxOutstandingPerTenant int `yaml:"max_outstanding_requests_per_tenant"` 84 QuerierForgetDelay time.Duration `yaml:"querier_forget_delay"` 85 GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."` 86 } 87 88 func (cfg *Config) RegisterFlags(f *flag.FlagSet) { 89 f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query-scheduler. In-flight requests above this limit will fail with HTTP response status code 429.") 90 f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.") 91 cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f) 92 } 93 94 // NewScheduler creates a new Scheduler. 95 func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer prometheus.Registerer) (*Scheduler, error) { 96 s := &Scheduler{ 97 cfg: cfg, 98 log: log, 99 limits: limits, 100 101 pendingRequests: map[requestKey]*schedulerRequest{}, 102 connectedFrontends: map[string]*connectedFrontend{}, 103 } 104 105 s.queueLength = promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{ 106 Name: "cortex_query_scheduler_queue_length", 107 Help: "Number of queries in the queue.", 108 }, []string{"user"}) 109 110 s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ 111 Name: "cortex_query_scheduler_discarded_requests_total", 112 Help: "Total number of query requests discarded.", 113 }, []string{"user"}) 114 s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests) 115 116 s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ 117 Name: "cortex_query_scheduler_queue_duration_seconds", 118 Help: "Time spend by requests in queue before getting picked up by a querier.", 119 Buckets: prometheus.DefBuckets, 120 }) 121 s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{ 122 Name: "cortex_query_scheduler_connected_querier_clients", 123 Help: "Number of querier worker clients currently connected to the query-scheduler.", 124 }, s.requestQueue.GetConnectedQuerierWorkersMetric) 125 s.connectedFrontendClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{ 126 Name: "cortex_query_scheduler_connected_frontend_clients", 127 Help: "Number of query-frontend worker clients currently connected to the query-scheduler.", 128 }, s.getConnectedFrontendClientsMetric) 129 130 s.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(s.cleanupMetricsForInactiveUser) 131 132 var err error 133 s.subservices, err = services.NewManager(s.requestQueue, s.activeUsers) 134 if err != nil { 135 return nil, err 136 } 137 138 s.Service = services.NewBasicService(s.starting, s.running, s.stopping) 139 return s, nil 140 } 141 142 // Limits needed for the Query Scheduler - interface used for decoupling. 143 type Limits interface { 144 // MaxQueriersPerUser returns max queriers to use per tenant, or 0 if shuffle sharding is disabled. 145 MaxQueriersPerUser(user string) int 146 } 147 148 type schedulerRequest struct { 149 frontendAddress string 150 userID string 151 queryID uint64 152 request *httpgrpc.HTTPRequest 153 statsEnabled bool 154 155 enqueueTime time.Time 156 157 ctx context.Context 158 ctxCancel context.CancelFunc 159 queueSpan opentracing.Span 160 161 // This is only used for testing. 162 parentSpanContext opentracing.SpanContext 163 } 164 165 // FrontendLoop handles connection from frontend. 166 func (s *Scheduler) FrontendLoop(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) error { 167 frontendAddress, frontendCtx, err := s.frontendConnected(frontend) 168 if err != nil { 169 return err 170 } 171 defer s.frontendDisconnected(frontendAddress) 172 173 // Response to INIT. If scheduler is not running, we skip for-loop, send SHUTTING_DOWN and exit this method. 174 if s.State() == services.Running { 175 if err := frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}); err != nil { 176 return err 177 } 178 } 179 180 // We stop accepting new queries in Stopping state. By returning quickly, we disconnect frontends, which in turns 181 // cancels all their queries. 182 for s.State() == services.Running { 183 msg, err := frontend.Recv() 184 if err != nil { 185 // No need to report this as error, it is expected when query-frontend performs SendClose() (as frontendSchedulerWorker does). 186 if err == io.EOF { 187 return nil 188 } 189 return err 190 } 191 192 if s.State() != services.Running { 193 break // break out of the loop, and send SHUTTING_DOWN message. 194 } 195 196 var resp *schedulerpb.SchedulerToFrontend 197 198 switch msg.GetType() { 199 case schedulerpb.ENQUEUE: 200 err = s.enqueueRequest(frontendCtx, frontendAddress, msg) 201 switch { 202 case err == nil: 203 resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK} 204 case err == queue.ErrTooManyRequests: 205 resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.TOO_MANY_REQUESTS_PER_TENANT} 206 default: 207 resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.ERROR, Error: err.Error()} 208 } 209 210 case schedulerpb.CANCEL: 211 s.cancelRequestAndRemoveFromPending(frontendAddress, msg.QueryID) 212 resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK} 213 214 default: 215 level.Error(s.log).Log("msg", "unknown request type from frontend", "addr", frontendAddress, "type", msg.GetType()) 216 return errors.New("unknown request type") 217 } 218 219 err = frontend.Send(resp) 220 // Failure to send response results in ending this connection. 221 if err != nil { 222 return err 223 } 224 } 225 226 // Report shutdown back to frontend, so that it can retry with different scheduler. Also stop the frontend loop. 227 return frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN}) 228 } 229 230 func (s *Scheduler) frontendConnected(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) (string, context.Context, error) { 231 msg, err := frontend.Recv() 232 if err != nil { 233 return "", nil, err 234 } 235 if msg.Type != schedulerpb.INIT || msg.FrontendAddress == "" { 236 return "", nil, errors.New("no frontend address") 237 } 238 239 s.connectedFrontendsMu.Lock() 240 defer s.connectedFrontendsMu.Unlock() 241 242 cf := s.connectedFrontends[msg.FrontendAddress] 243 if cf == nil { 244 cf = &connectedFrontend{ 245 connections: 0, 246 } 247 cf.ctx, cf.cancel = context.WithCancel(context.Background()) 248 s.connectedFrontends[msg.FrontendAddress] = cf 249 } 250 251 cf.connections++ 252 return msg.FrontendAddress, cf.ctx, nil 253 } 254 255 func (s *Scheduler) frontendDisconnected(frontendAddress string) { 256 s.connectedFrontendsMu.Lock() 257 defer s.connectedFrontendsMu.Unlock() 258 259 cf := s.connectedFrontends[frontendAddress] 260 cf.connections-- 261 if cf.connections == 0 { 262 delete(s.connectedFrontends, frontendAddress) 263 cf.cancel() 264 } 265 } 266 267 func (s *Scheduler) enqueueRequest(frontendContext context.Context, frontendAddr string, msg *schedulerpb.FrontendToScheduler) error { 268 // Create new context for this request, to support cancellation. 269 ctx, cancel := context.WithCancel(frontendContext) 270 shouldCancel := true 271 defer func() { 272 if shouldCancel { 273 cancel() 274 } 275 }() 276 277 // Extract tracing information from headers in HTTP request. FrontendContext doesn't have the correct tracing 278 // information, since that is a long-running request. 279 tracer := opentracing.GlobalTracer() 280 parentSpanContext, err := httpgrpcutil.GetParentSpanForRequest(tracer, msg.HttpRequest) 281 if err != nil { 282 return err 283 } 284 285 userID := msg.GetUserID() 286 287 req := &schedulerRequest{ 288 frontendAddress: frontendAddr, 289 userID: msg.UserID, 290 queryID: msg.QueryID, 291 request: msg.HttpRequest, 292 statsEnabled: msg.StatsEnabled, 293 } 294 295 now := time.Now() 296 297 req.parentSpanContext = parentSpanContext 298 req.queueSpan, req.ctx = opentracing.StartSpanFromContextWithTracer(ctx, tracer, "queued", opentracing.ChildOf(parentSpanContext)) 299 req.enqueueTime = now 300 req.ctxCancel = cancel 301 302 // aggregate the max queriers limit in the case of a multi tenant query 303 tenantIDs, err := tenant.TenantIDsFromOrgID(userID) 304 if err != nil { 305 return err 306 } 307 maxQueriers := validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, s.limits.MaxQueriersPerUser) 308 309 s.activeUsers.UpdateUserTimestamp(userID, now) 310 return s.requestQueue.EnqueueRequest(userID, req, maxQueriers, func() { 311 shouldCancel = false 312 313 s.pendingRequestsMu.Lock() 314 defer s.pendingRequestsMu.Unlock() 315 s.pendingRequests[requestKey{frontendAddr: frontendAddr, queryID: msg.QueryID}] = req 316 }) 317 } 318 319 // This method doesn't do removal from the queue. 320 func (s *Scheduler) cancelRequestAndRemoveFromPending(frontendAddr string, queryID uint64) { 321 s.pendingRequestsMu.Lock() 322 defer s.pendingRequestsMu.Unlock() 323 324 key := requestKey{frontendAddr: frontendAddr, queryID: queryID} 325 req := s.pendingRequests[key] 326 if req != nil { 327 req.ctxCancel() 328 } 329 delete(s.pendingRequests, key) 330 } 331 332 // QuerierLoop is started by querier to receive queries from scheduler. 333 func (s *Scheduler) QuerierLoop(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer) error { 334 resp, err := querier.Recv() 335 if err != nil { 336 return err 337 } 338 339 querierID := resp.GetQuerierID() 340 341 s.requestQueue.RegisterQuerierConnection(querierID) 342 defer s.requestQueue.UnregisterQuerierConnection(querierID) 343 344 // If the downstream connection to querier is cancelled, 345 // we need to ping the condition variable to unblock getNextRequestForQuerier. 346 // Ideally we'd have ctx aware condition variables... 347 go func() { 348 <-querier.Context().Done() 349 s.requestQueue.QuerierDisconnecting() 350 }() 351 352 lastUserIndex := queue.FirstUser() 353 354 // In stopping state scheduler is not accepting new queries, but still dispatching queries in the queues. 355 for s.isRunningOrStopping() { 356 req, idx, err := s.requestQueue.GetNextRequestForQuerier(querier.Context(), lastUserIndex, querierID) 357 if err != nil { 358 return err 359 } 360 lastUserIndex = idx 361 362 r := req.(*schedulerRequest) 363 364 s.queueDuration.Observe(time.Since(r.enqueueTime).Seconds()) 365 r.queueSpan.Finish() 366 367 /* 368 We want to dequeue the next unexpired request from the chosen tenant queue. 369 The chance of choosing a particular tenant for dequeueing is (1/active_tenants). 370 This is problematic under load, especially with other middleware enabled such as 371 querier.split-by-interval, where one request may fan out into many. 372 If expired requests aren't exhausted before checking another tenant, it would take 373 n_active_tenants * n_expired_requests_at_front_of_queue requests being processed 374 before an active request was handled for the tenant in question. 375 If this tenant meanwhile continued to queue requests, 376 it's possible that it's own queue would perpetually contain only expired requests. 377 */ 378 379 if r.ctx.Err() != nil { 380 // Remove from pending requests. 381 s.cancelRequestAndRemoveFromPending(r.frontendAddress, r.queryID) 382 383 lastUserIndex = lastUserIndex.ReuseLastUser() 384 continue 385 } 386 387 if err := s.forwardRequestToQuerier(querier, r); err != nil { 388 return err 389 } 390 } 391 392 return errSchedulerIsNotRunning 393 } 394 395 func (s *Scheduler) NotifyQuerierShutdown(_ context.Context, req *schedulerpb.NotifyQuerierShutdownRequest) (*schedulerpb.NotifyQuerierShutdownResponse, error) { 396 level.Info(s.log).Log("msg", "received shutdown notification from querier", "querier", req.GetQuerierID()) 397 s.requestQueue.NotifyQuerierShutdown(req.GetQuerierID()) 398 399 return &schedulerpb.NotifyQuerierShutdownResponse{}, nil 400 } 401 402 func (s *Scheduler) forwardRequestToQuerier(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer, req *schedulerRequest) error { 403 // Make sure to cancel request at the end to cleanup resources. 404 defer s.cancelRequestAndRemoveFromPending(req.frontendAddress, req.queryID) 405 406 // Handle the stream sending & receiving on a goroutine so we can 407 // monitoring the contexts in a select and cancel things appropriately. 408 errCh := make(chan error, 1) 409 go func() { 410 err := querier.Send(&schedulerpb.SchedulerToQuerier{ 411 UserID: req.userID, 412 QueryID: req.queryID, 413 FrontendAddress: req.frontendAddress, 414 HttpRequest: req.request, 415 StatsEnabled: req.statsEnabled, 416 }) 417 if err != nil { 418 errCh <- err 419 return 420 } 421 422 _, err = querier.Recv() 423 errCh <- err 424 }() 425 426 select { 427 case <-req.ctx.Done(): 428 // If the upstream request is cancelled (eg. frontend issued CANCEL or closed connection), 429 // we need to cancel the downstream req. Only way we can do that is to close the stream (by returning error here). 430 // Querier is expecting this semantics. 431 return req.ctx.Err() 432 433 case err := <-errCh: 434 // Is there was an error handling this request due to network IO, 435 // then error out this upstream request _and_ stream. 436 437 if err != nil { 438 s.forwardErrorToFrontend(req.ctx, req, err) 439 } 440 return err 441 } 442 } 443 444 func (s *Scheduler) forwardErrorToFrontend(ctx context.Context, req *schedulerRequest, requestErr error) { 445 opts, err := s.cfg.GRPCClientConfig.DialOption([]grpc.UnaryClientInterceptor{ 446 otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()), 447 middleware.ClientUserHeaderInterceptor}, 448 nil) 449 if err != nil { 450 level.Warn(s.log).Log("msg", "failed to create gRPC options for the connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr) 451 return 452 } 453 454 conn, err := grpc.DialContext(ctx, req.frontendAddress, opts...) 455 if err != nil { 456 level.Warn(s.log).Log("msg", "failed to create gRPC connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr) 457 return 458 } 459 460 defer func() { 461 _ = conn.Close() 462 }() 463 464 client := frontendv2pb.NewFrontendForQuerierClient(conn) 465 466 userCtx := user.InjectOrgID(ctx, req.userID) 467 _, err = client.QueryResult(userCtx, &frontendv2pb.QueryResultRequest{ 468 QueryID: req.queryID, 469 HttpResponse: &httpgrpc.HTTPResponse{ 470 Code: http.StatusInternalServerError, 471 Body: []byte(requestErr.Error()), 472 }, 473 }) 474 475 if err != nil { 476 level.Warn(s.log).Log("msg", "failed to forward error to frontend", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr) 477 return 478 } 479 } 480 481 func (s *Scheduler) isRunningOrStopping() bool { 482 st := s.State() 483 return st == services.Running || st == services.Stopping 484 } 485 486 func (s *Scheduler) starting(ctx context.Context) error { 487 s.subservicesWatcher.WatchManager(s.subservices) 488 489 if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil { 490 return errors.Wrap(err, "unable to start scheduler subservices") 491 } 492 493 return nil 494 } 495 496 func (s *Scheduler) running(ctx context.Context) error { 497 for { 498 select { 499 case <-ctx.Done(): 500 return nil 501 case err := <-s.subservicesWatcher.Chan(): 502 return errors.Wrap(err, "scheduler subservice failed") 503 } 504 } 505 } 506 507 // Close the Scheduler. 508 func (s *Scheduler) stopping(_ error) error { 509 // This will also stop the requests queue, which stop accepting new requests and errors out any pending requests. 510 return services.StopManagerAndAwaitStopped(context.Background(), s.subservices) 511 } 512 513 func (s *Scheduler) cleanupMetricsForInactiveUser(user string) { 514 s.queueLength.DeleteLabelValues(user) 515 s.discardedRequests.DeleteLabelValues(user) 516 } 517 518 func (s *Scheduler) getConnectedFrontendClientsMetric() float64 { 519 s.connectedFrontendsMu.Lock() 520 defer s.connectedFrontendsMu.Unlock() 521 522 count := 0 523 for _, workers := range s.connectedFrontends { 524 count += workers.connections 525 } 526 527 return float64(count) 528 }