github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/scheduler/scheduler.go

package scheduler

import (
	"context"
	"flag"
	"io"
	"net/http"
	"net/textproto"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	otgrpc "github.com/opentracing-contrib/go-grpc"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/weaveworks/common/httpgrpc"
	"github.com/weaveworks/common/middleware"
	"github.com/weaveworks/common/user"
	"go.uber.org/atomic"
	"google.golang.org/grpc"

	"github.com/grafana/dskit/tenant"

	"github.com/grafana/loki/pkg/lokifrontend/frontend/v2/frontendv2pb"
	"github.com/grafana/loki/pkg/scheduler/queue"
	"github.com/grafana/loki/pkg/scheduler/schedulerpb"
	"github.com/grafana/loki/pkg/util"
	lokiutil "github.com/grafana/loki/pkg/util"
	lokigrpc "github.com/grafana/loki/pkg/util/httpgrpc"
	lokihttpreq "github.com/grafana/loki/pkg/util/httpreq"
	util_log "github.com/grafana/loki/pkg/util/log"
	"github.com/grafana/loki/pkg/util/validation"
)

var errSchedulerIsNotRunning = errors.New("scheduler is not running")

const (
	// ringAutoForgetUnhealthyPeriods is how many consecutive heartbeat timeout periods an unhealthy instance
	// in the ring is tolerated for before being automatically removed.
	ringAutoForgetUnhealthyPeriods = 10

	// ringKey is the key under which we store the schedulers ring in the KVStore.
	ringKey = "scheduler"

	// ringNameForServer is the name of the ring used by the scheduler server.
	ringNameForServer = "scheduler"

	// ringReplicationFactor should be 2 because we want 2 schedulers.
	ringReplicationFactor = 2

	// ringNumTokens sets our single token in the ring;
	// we only need to insert 1 token to be used for leader election purposes.
	ringNumTokens = 1

	// ringCheckPeriod is how often we check the ring to see if this instance is still in
	// the replicaset of instances to act as schedulers.
	ringCheckPeriod = 3 * time.Second
)

// Scheduler is responsible for queueing and dispatching queries to Queriers.
type Scheduler struct {
	services.Service

	cfg Config
	log log.Logger

	limits Limits

	connectedFrontendsMu sync.Mutex
	connectedFrontends   map[string]*connectedFrontend

	requestQueue *queue.RequestQueue
	activeUsers  *util.ActiveUsersCleanupService

	pendingRequestsMu sync.Mutex
	pendingRequests   map[requestKey]*schedulerRequest // Request is kept in this map even after being dispatched to a querier. It can still be canceled at that time.

	// Subservices manager.
	subservices        *services.Manager
	subservicesWatcher *services.FailureWatcher

	// Metrics.
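	// All metrics are registered in NewScheduler under cortex_query_scheduler_* names; queueLength and
	// discardedRequests are per-tenant vectors whose series are removed by cleanupMetricsForInactiveUser
	// once a tenant becomes inactive.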
	queueLength              *prometheus.GaugeVec
	discardedRequests        *prometheus.CounterVec
	connectedQuerierClients  prometheus.GaugeFunc
	connectedFrontendClients prometheus.GaugeFunc
	queueDuration            prometheus.Histogram
	schedulerRunning         prometheus.Gauge
	inflightRequests         prometheus.Summary

	// Ring used for finding schedulers
	ringLifecycler *ring.BasicLifecycler
	ring           *ring.Ring

	// Controls for this being a chosen scheduler
	shouldRun atomic.Bool
}

type requestKey struct {
	frontendAddr string
	queryID      uint64
}

type connectedFrontend struct {
	connections int
	frontend    schedulerpb.SchedulerForFrontend_FrontendLoopServer

	// This context is used for running all queries from the same frontend.
	// When the last frontend connection is closed, the context is canceled.
	ctx    context.Context
	cancel context.CancelFunc
}

type Config struct {
	MaxOutstandingPerTenant int               `yaml:"max_outstanding_requests_per_tenant"`
	QuerierForgetDelay      time.Duration     `yaml:"querier_forget_delay"`
	GRPCClientConfig        grpcclient.Config `yaml:"grpc_client_config" doc:"description=This configures the gRPC client used to report errors back to the query-frontend."`
	// Schedulers ring
	UseSchedulerRing bool                `yaml:"use_scheduler_ring"`
	SchedulerRing    lokiutil.RingConfig `yaml:"scheduler_ring,omitempty"`
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.IntVar(&cfg.MaxOutstandingPerTenant, "query-scheduler.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per query scheduler. In-flight requests above this limit will fail with HTTP response status code 429.")
	f.DurationVar(&cfg.QuerierForgetDelay, "query-scheduler.querier-forget-delay", 0, "If a querier disconnects without sending notification about graceful shutdown, the query-scheduler will keep the querier in the tenant's shard until the forget delay has passed. This feature is useful to reduce the blast radius when shuffle-sharding is enabled.")
	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-scheduler.grpc-client-config", f)
	f.BoolVar(&cfg.UseSchedulerRing, "query-scheduler.use-scheduler-ring", false, "Set to true to have the query scheduler create a ring and have the frontend and frontend_worker use this ring to get the addresses of the query schedulers. If frontend_address and scheduler_address are not present in the config, this value will be toggled to true by Loki.")
	cfg.SchedulerRing.RegisterFlagsWithPrefix("query-scheduler.", "collectors/", f)
}

// NewScheduler creates a new Scheduler.
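//
// A minimal wiring sketch (illustrative only; the flag set, logger, registry and limits
// value below are placeholders, not how Loki itself constructs the scheduler):
//
//	var cfg Config
//	cfg.RegisterFlags(flag.NewFlagSet("scheduler", flag.ContinueOnError))
//
//	s, err := NewScheduler(cfg, limits, log.NewNopLogger(), prometheus.NewRegistry())
//	if err != nil {
//		return err
//	}
//	// Scheduler is a dskit service: start it and wait until it is Running before
//	// exposing FrontendLoop and QuerierLoop over gRPC.
//	if err := services.StartAndAwaitRunning(ctx, s); err != nil {
//		return err
//	}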
func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer prometheus.Registerer) (*Scheduler, error) {
	s := &Scheduler{
		cfg:    cfg,
		log:    log,
		limits: limits,

		pendingRequests:    map[requestKey]*schedulerRequest{},
		connectedFrontends: map[string]*connectedFrontend{},
	}

	s.queueLength = promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
		Name: "cortex_query_scheduler_queue_length",
		Help: "Number of queries in the queue.",
	}, []string{"user"})

	s.discardedRequests = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_query_scheduler_discarded_requests_total",
		Help: "Total number of query requests discarded.",
	}, []string{"user"})
	s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests)

	s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
		Name:    "cortex_query_scheduler_queue_duration_seconds",
		Help:    "Time spent by requests in the queue before getting picked up by a querier.",
		Buckets: prometheus.DefBuckets,
	})
	s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "cortex_query_scheduler_connected_querier_clients",
		Help: "Number of querier worker clients currently connected to the query-scheduler.",
	}, s.requestQueue.GetConnectedQuerierWorkersMetric)
	s.connectedFrontendClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "cortex_query_scheduler_connected_frontend_clients",
		Help: "Number of query-frontend worker clients currently connected to the query-scheduler.",
	}, s.getConnectedFrontendClientsMetric)
	s.schedulerRunning = promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
		Name: "cortex_query_scheduler_running",
		Help: "Value will be 1 if the scheduler is in the ReplicationSet and actively receiving/processing requests",
	})
	s.inflightRequests = promauto.With(registerer).NewSummary(prometheus.SummaryOpts{
		Name:       "cortex_query_scheduler_inflight_requests",
		Help:       "Number of inflight requests (either queued or processing) sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s.",
		Objectives: map[float64]float64{0.5: 0.05, 0.75: 0.02, 0.8: 0.02, 0.9: 0.01, 0.95: 0.01, 0.99: 0.001},
		MaxAge:     time.Minute,
		AgeBuckets: 6,
	})

	s.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(s.cleanupMetricsForInactiveUser)

	svcs := []services.Service{s.requestQueue, s.activeUsers}

	if cfg.UseSchedulerRing {
		s.shouldRun.Store(false)
		ringStore, err := kv.NewClient(
			cfg.SchedulerRing.KVStore,
			ring.GetCodec(),
			kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("loki_", registerer), "scheduler"),
			log,
		)
		if err != nil {
			return nil, errors.Wrap(err, "create KV store client")
		}
		lifecyclerCfg, err := cfg.SchedulerRing.ToLifecyclerConfig(ringNumTokens, log)
		if err != nil {
			return nil, errors.Wrap(err, "invalid ring lifecycler config")
		}

		// Define lifecycler delegates in reverse order (last to be called defined first because they're
		// chained via "next delegate").
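		// Read outward-in, the resulting chain is:
		//   AutoForget (forgets instances unhealthy for ringAutoForgetUnhealthyPeriods heartbeat timeouts)
		//     -> TokensPersistency (loads/saves tokens at cfg.SchedulerRing.TokensFilePath, registering as JOINING)
		//       -> LeaveOnStopping (moves the instance to LEAVING on shutdown)
		//         -> Scheduler itself (the OnRingInstance* hooks at the bottom of this file).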
		delegate := ring.BasicLifecyclerDelegate(s)
		delegate = ring.NewLeaveOnStoppingDelegate(delegate, log)
		delegate = ring.NewTokensPersistencyDelegate(cfg.SchedulerRing.TokensFilePath, ring.JOINING, delegate, log)
		delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*cfg.SchedulerRing.HeartbeatTimeout, delegate, log)

		s.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, ringNameForServer, ringKey, ringStore, delegate, log, registerer)
		if err != nil {
			return nil, errors.Wrap(err, "create ring lifecycler")
		}

		ringCfg := cfg.SchedulerRing.ToRingConfig(ringReplicationFactor)
		s.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, ringNameForServer, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", registerer), util_log.Logger)
		if err != nil {
			return nil, errors.Wrap(err, "create ring client")
		}

		svcs = append(svcs, s.ringLifecycler, s.ring)
	} else {
		// Always run if no scheduler ring is being used.
		s.shouldRun.Store(true)
	}

	var err error
	s.subservices, err = services.NewManager(svcs...)
	if err != nil {
		return nil, err
	}
	s.subservicesWatcher = services.NewFailureWatcher()
	s.subservicesWatcher.WatchManager(s.subservices)

	s.Service = services.NewBasicService(s.starting, s.running, s.stopping)
	return s, nil
}

// Limits needed for the Query Scheduler - interface used for decoupling.
type Limits interface {
	// MaxQueriersPerUser returns the max queriers to use per tenant, or 0 if shuffle sharding is disabled.
	MaxQueriersPerUser(user string) int
}

type schedulerRequest struct {
	frontendAddress string
	userID          string
	queryID         uint64
	request         *httpgrpc.HTTPRequest
	statsEnabled    bool

	queueTime time.Time

	ctx       context.Context
	ctxCancel context.CancelFunc
	queueSpan opentracing.Span

	// This is only used for testing.
	parentSpanContext opentracing.SpanContext
}

// FrontendLoop handles a connection from a frontend.
func (s *Scheduler) FrontendLoop(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) error {
	frontendAddress, frontendCtx, err := s.frontendConnected(frontend)
	if err != nil {
		return err
	}
	defer s.frontendDisconnected(frontendAddress)

	// Respond to INIT. If the scheduler is not running, we skip the for-loop, send SHUTTING_DOWN and exit this method.
	if s.State() == services.Running && s.shouldRun.Load() {
		if err := frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}); err != nil {
			return err
		}
	}

	// We stop accepting new queries in the Stopping state. By returning quickly, we disconnect frontends, which in turn
	// cancels all their queries.
	for s.State() == services.Running {
		msg, err := frontend.Recv()
		if err != nil {
			// No need to report this as an error, it is expected when the query-frontend performs SendClose() (as frontendSchedulerWorker does).
			if err == io.EOF {
				return nil
			}
			return err
		}

		if s.State() != services.Running {
			break // break out of the loop, and send SHUTTING_DOWN message.
		}

		var resp *schedulerpb.SchedulerToFrontend

		switch msg.GetType() {
		case schedulerpb.ENQUEUE:
			err = s.enqueueRequest(frontendCtx, frontendAddress, msg)
			switch {
			case err == nil:
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}
			case err == queue.ErrTooManyRequests:
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.TOO_MANY_REQUESTS_PER_TENANT}
			default:
				resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.ERROR, Error: err.Error()}
			}

		case schedulerpb.CANCEL:
			s.cancelRequestAndRemoveFromPending(frontendAddress, msg.QueryID)
			resp = &schedulerpb.SchedulerToFrontend{Status: schedulerpb.OK}

		default:
			level.Error(s.log).Log("msg", "unknown request type from frontend", "addr", frontendAddress, "type", msg.GetType())
			return errors.New("unknown request type")
		}

		err = frontend.Send(resp)
		// Failure to send the response results in ending this connection.
		if err != nil {
			return err
		}
	}

	// Report shutdown back to the frontend, so that it can retry with a different scheduler. Also stop the frontend loop.
	return frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN})
}

func (s *Scheduler) frontendConnected(frontend schedulerpb.SchedulerForFrontend_FrontendLoopServer) (string, context.Context, error) {
	msg, err := frontend.Recv()
	if err != nil {
		return "", nil, err
	}
	if msg.Type != schedulerpb.INIT || msg.FrontendAddress == "" {
		return "", nil, errors.New("no frontend address")
	}

	level.Debug(s.log).Log("msg", "frontend connected", "address", msg.FrontendAddress)

	s.connectedFrontendsMu.Lock()
	defer s.connectedFrontendsMu.Unlock()

	cf := s.connectedFrontends[msg.FrontendAddress]
	if cf == nil {
		cf = &connectedFrontend{
			connections: 0,
			frontend:    frontend,
		}
		cf.ctx, cf.cancel = context.WithCancel(context.Background())
		s.connectedFrontends[msg.FrontendAddress] = cf
	}

	cf.connections++
	return msg.FrontendAddress, cf.ctx, nil
}

func (s *Scheduler) frontendDisconnected(frontendAddress string) {
	s.connectedFrontendsMu.Lock()
	defer s.connectedFrontendsMu.Unlock()

	level.Debug(s.log).Log("msg", "frontend disconnected", "address", frontendAddress)

	cf := s.connectedFrontends[frontendAddress]
	cf.connections--
	if cf.connections == 0 {
		delete(s.connectedFrontends, frontendAddress)
		cf.cancel()
	}
}

func (s *Scheduler) enqueueRequest(frontendContext context.Context, frontendAddr string, msg *schedulerpb.FrontendToScheduler) error {
	// Create a new context for this request, to support cancellation.
	ctx, cancel := context.WithCancel(frontendContext)
	shouldCancel := true
	defer func() {
		if shouldCancel {
			cancel()
		}
	}()

	// Extract tracing information from the headers in the HTTP request. frontendContext doesn't have the correct tracing
	// information, since that is a long-running request.
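	// The parent span context travels in the serialized msg.HttpRequest headers; the "queued" span
	// started below is created as a child of it, so time spent in the queue shows up in the query trace.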
	tracer := opentracing.GlobalTracer()
	parentSpanContext, err := lokigrpc.GetParentSpanForRequest(tracer, msg.HttpRequest)
	if err != nil {
		return err
	}

	userID := msg.GetUserID()

	req := &schedulerRequest{
		frontendAddress: frontendAddr,
		userID:          msg.UserID,
		queryID:         msg.QueryID,
		request:         msg.HttpRequest,
		statsEnabled:    msg.StatsEnabled,
	}

	now := time.Now()

	req.parentSpanContext = parentSpanContext
	req.queueSpan, req.ctx = opentracing.StartSpanFromContextWithTracer(ctx, tracer, "queued", opentracing.ChildOf(parentSpanContext))
	req.queueTime = now
	req.ctxCancel = cancel

	// Aggregate the max queriers limit in the case of a multi-tenant query.
	tenantIDs, err := tenant.TenantIDsFromOrgID(userID)
	if err != nil {
		return err
	}
	maxQueriers := validation.SmallestPositiveNonZeroIntPerTenant(tenantIDs, s.limits.MaxQueriersPerUser)

	s.activeUsers.UpdateUserTimestamp(userID, now)
	return s.requestQueue.EnqueueRequest(userID, req, maxQueriers, func() {
		shouldCancel = false

		s.pendingRequestsMu.Lock()
		defer s.pendingRequestsMu.Unlock()
		s.pendingRequests[requestKey{frontendAddr: frontendAddr, queryID: msg.QueryID}] = req
	})
}

// This method doesn't do removal from the queue.
func (s *Scheduler) cancelRequestAndRemoveFromPending(frontendAddr string, queryID uint64) {
	s.pendingRequestsMu.Lock()
	defer s.pendingRequestsMu.Unlock()

	key := requestKey{frontendAddr: frontendAddr, queryID: queryID}
	req := s.pendingRequests[key]
	if req != nil {
		req.ctxCancel()
	}
	delete(s.pendingRequests, key)
}

// QuerierLoop is started by a querier to receive queries from the scheduler.
func (s *Scheduler) QuerierLoop(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer) error {
	resp, err := querier.Recv()
	if err != nil {
		return err
	}

	querierID := resp.GetQuerierID()
	level.Debug(s.log).Log("msg", "querier connected", "querier", querierID)

	s.requestQueue.RegisterQuerierConnection(querierID)
	defer s.requestQueue.UnregisterQuerierConnection(querierID)

	lastUserIndex := queue.FirstUser()

	// In the stopping state the scheduler does not accept new queries, but still dispatches queries already in the queues.
	for s.isRunningOrStopping() {
		req, idx, err := s.requestQueue.GetNextRequestForQuerier(querier.Context(), lastUserIndex, querierID)
		if err != nil {
			return err
		}
		lastUserIndex = idx

		r := req.(*schedulerRequest)

		reqQueueTime := time.Since(r.queueTime)
		s.queueDuration.Observe(reqQueueTime.Seconds())
		r.queueSpan.Finish()

		// Add an HTTP header to the request containing the query queue time.
		r.request.Headers = append(r.request.Headers, &httpgrpc.Header{
			Key:    textproto.CanonicalMIMEHeaderKey(string(lokihttpreq.QueryQueueTimeHTTPHeader)),
			Values: []string{reqQueueTime.String()},
		})

		/*
			We want to dequeue the next unexpired request from the chosen tenant queue.
			The chance of choosing a particular tenant for dequeueing is (1/active_tenants).
			This is problematic under load, especially with other middleware enabled such as
			querier.split-by-interval, where one request may fan out into many.
			If expired requests aren't exhausted before checking another tenant, it would take
			n_active_tenants * n_expired_requests_at_front_of_queue requests being processed
			before an active request was handled for the tenant in question.
			If this tenant meanwhile continued to queue requests,
			it's possible that its own queue would perpetually contain only expired requests.
		*/

		if r.ctx.Err() != nil {
			// Remove from pending requests.
			s.cancelRequestAndRemoveFromPending(r.frontendAddress, r.queryID)

			lastUserIndex = lastUserIndex.ReuseLastUser()
			continue
		}

		if err := s.forwardRequestToQuerier(querier, r); err != nil {
			return err
		}
	}

	return errSchedulerIsNotRunning
}

func (s *Scheduler) NotifyQuerierShutdown(_ context.Context, req *schedulerpb.NotifyQuerierShutdownRequest) (*schedulerpb.NotifyQuerierShutdownResponse, error) {
	level.Debug(s.log).Log("msg", "received shutdown notification from querier", "querier", req.GetQuerierID())
	s.requestQueue.NotifyQuerierShutdown(req.GetQuerierID())

	return &schedulerpb.NotifyQuerierShutdownResponse{}, nil
}

func (s *Scheduler) forwardRequestToQuerier(querier schedulerpb.SchedulerForQuerier_QuerierLoopServer, req *schedulerRequest) error {
	// Make sure to cancel the request at the end to clean up resources.
	defer s.cancelRequestAndRemoveFromPending(req.frontendAddress, req.queryID)

	// Handle the stream sending & receiving on a goroutine so we can
	// monitor the contexts in a select and cancel things appropriately.
	errCh := make(chan error, 1)
	go func() {
		err := querier.Send(&schedulerpb.SchedulerToQuerier{
			UserID:          req.userID,
			QueryID:         req.queryID,
			FrontendAddress: req.frontendAddress,
			HttpRequest:     req.request,
			StatsEnabled:    req.statsEnabled,
		})
		if err != nil {
			errCh <- err
			return
		}

		_, err = querier.Recv()
		errCh <- err
	}()

	select {
	case <-req.ctx.Done():
		// If the upstream request is cancelled (e.g. the frontend issued CANCEL or closed the connection),
		// we need to cancel the downstream req. The only way we can do that is to close the stream (by returning an error here).
		// The querier expects these semantics.
		return req.ctx.Err()

	case err := <-errCh:
		// If there was an error handling this request due to network IO,
		// then error out this upstream request _and_ stream.

		if err != nil {
			s.forwardErrorToFrontend(req.ctx, req, err)
		}
		return err
	}
}

func (s *Scheduler) forwardErrorToFrontend(ctx context.Context, req *schedulerRequest, requestErr error) {
	opts, err := s.cfg.GRPCClientConfig.DialOption([]grpc.UnaryClientInterceptor{
		otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
		middleware.ClientUserHeaderInterceptor,
	},
		nil)
	if err != nil {
		level.Warn(s.log).Log("msg", "failed to create gRPC options for the connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
		return
	}

	conn, err := grpc.DialContext(ctx, req.frontendAddress, opts...)
	if err != nil {
		level.Warn(s.log).Log("msg", "failed to create gRPC connection to frontend to report error", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
		return
	}

	defer func() {
		_ = conn.Close()
	}()

	client := frontendv2pb.NewFrontendForQuerierClient(conn)

	userCtx := user.InjectOrgID(ctx, req.userID)
	_, err = client.QueryResult(userCtx, &frontendv2pb.QueryResultRequest{
		QueryID: req.queryID,
		HttpResponse: &httpgrpc.HTTPResponse{
			Code: http.StatusInternalServerError,
			Body: []byte(requestErr.Error()),
		},
	})

	if err != nil {
		level.Warn(s.log).Log("msg", "failed to forward error to frontend", "frontend", req.frontendAddress, "err", err, "requestErr", requestErr)
		return
	}
}

func (s *Scheduler) isRunningOrStopping() bool {
	st := s.State()
	return st == services.Running || st == services.Stopping
}

func (s *Scheduler) starting(ctx context.Context) (err error) {
	// In case this function returns an error we want to unregister the instance
	// from the ring. We do it by ensuring dependencies are gracefully stopped if they
	// were already started.
	defer func() {
		if err == nil || s.subservices == nil {
			return
		}

		if stopErr := services.StopManagerAndAwaitStopped(context.Background(), s.subservices); stopErr != nil {
			level.Error(s.log).Log("msg", "failed to gracefully stop scheduler dependencies", "err", stopErr)
		}
	}()

	if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil {
		return errors.Wrap(err, "unable to start scheduler subservices")
	}

	if s.cfg.UseSchedulerRing {
		// The BasicLifecycler does not automatically move the state to ACTIVE, so that any additional work
		// someone wants to do can be done before becoming ACTIVE. For the query scheduler we don't currently
		// have any additional work, so we can become ACTIVE right away.

		// Wait until the ring client has detected this instance in the JOINING state to
		// make sure that when we run the initial sync we already know the tokens
		// assigned to this instance.
		level.Info(s.log).Log("msg", "waiting until scheduler is JOINING in the ring")
		if err := ring.WaitInstanceState(ctx, s.ring, s.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
			return err
		}
		level.Info(s.log).Log("msg", "scheduler is JOINING in the ring")

		// Change the ring state to ACTIVE.
		if err = s.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
		}

		// Wait until the ring client has detected this instance in the ACTIVE state to
		// make sure that when we run the loop it won't be detected as a ring
		// topology change.
		level.Info(s.log).Log("msg", "waiting until scheduler is ACTIVE in the ring")
		if err := ring.WaitInstanceState(ctx, s.ring, s.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
			return err
		}
		level.Info(s.log).Log("msg", "scheduler is ACTIVE in the ring")
	}

	return nil
}

func (s *Scheduler) running(ctx context.Context) error {
	// We observe inflight requests frequently and at regular intervals, to have a good
	// approximation of max inflight requests over percentiles of time. We also do it with
	// a ticker so that we keep tracking it even if we have no new queries but stuck inflight
	// requests (e.g.
	// queriers are all crashing).
	inflightRequestsTicker := time.NewTicker(250 * time.Millisecond)
	defer inflightRequestsTicker.Stop()

	ringCheckTicker := time.NewTicker(ringCheckPeriod)
	defer ringCheckTicker.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil
		case err := <-s.subservicesWatcher.Chan():
			return errors.Wrap(err, "scheduler subservice failed")
		case <-ringCheckTicker.C:
			if !s.cfg.UseSchedulerRing {
				continue
			}
			isInSet, err := lokiutil.IsInReplicationSet(s.ring, lokiutil.RingKeyOfLeader, s.ringLifecycler.GetInstanceAddr())
			if err != nil {
				level.Error(s.log).Log("msg", "failed to query the ring to see if scheduler instance is in ReplicationSet, will try again", "err", err)
				continue
			}
			s.setRunState(isInSet)
		case <-inflightRequestsTicker.C:
			s.pendingRequestsMu.Lock()
			inflight := len(s.pendingRequests)
			s.pendingRequestsMu.Unlock()

			s.inflightRequests.Observe(float64(inflight))
		}
	}
}

func (s *Scheduler) setRunState(isInSet bool) {
	if isInSet {
		if s.shouldRun.CAS(false, true) {
			// Value was swapped, meaning this was a state change from stopped to running.
			level.Info(s.log).Log("msg", "this scheduler is in the ReplicationSet, will now accept requests.")
			s.schedulerRunning.Set(1)
		}
	} else {
		if s.shouldRun.CAS(true, false) {
			// Value was swapped, meaning this was a state change from running to stopped,
			// so we need to send shutdown to all the connected frontends.
			level.Info(s.log).Log("msg", "this scheduler is no longer in the ReplicationSet, disconnecting frontends, canceling queries and no longer accepting requests.")

			// Send a shutdown message to the connected frontends: there is no way to break the blocking Recv() in FrontendLoop(),
			// so we send a message to the frontends telling them we are shutting down so they will disconnect.
			// When FrontendLoop() exits for a connected frontend, all of its inflight and queued queries will be canceled.
			s.connectedFrontendsMu.Lock()
			defer s.connectedFrontendsMu.Unlock()
			for _, f := range s.connectedFrontends {
				// We ignore any errors here because there isn't really an action to take and because
				// the frontends are also discovering the ring changes and may already be disconnected
				// or have disconnected.
				_ = f.frontend.Send(&schedulerpb.SchedulerToFrontend{Status: schedulerpb.SHUTTING_DOWN})
			}
			s.schedulerRunning.Set(0)
		}
	}
}

// Close the Scheduler.
func (s *Scheduler) stopping(_ error) error {
	// This will also stop the request queue, which stops accepting new requests and errors out any pending ones.
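	// The subservices manager covers the request queue, the active-users cleanup service and,
	// when the scheduler ring is in use, the ring lifecycler and ring client.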
	return services.StopManagerAndAwaitStopped(context.Background(), s.subservices)
}

func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
	s.queueLength.DeleteLabelValues(user)
	s.discardedRequests.DeleteLabelValues(user)
}

func (s *Scheduler) getConnectedFrontendClientsMetric() float64 {
	s.connectedFrontendsMu.Lock()
	defer s.connectedFrontendsMu.Unlock()

	count := 0
	for _, workers := range s.connectedFrontends {
		count += workers.connections
	}

	return float64(count)
}

// SafeReadRing does a nil check on the Scheduler before attempting to return its ring.
// This is necessary as many callers of this function will only have a valid Scheduler
// reference if the QueryScheduler target has been specified, which is not guaranteed.
func SafeReadRing(s *Scheduler) ring.ReadRing {
	if s == nil || s.ring == nil || !s.cfg.UseSchedulerRing {
		return nil
	}

	return s.ring
}

func (s *Scheduler) OnRingInstanceRegister(_ *ring.BasicLifecycler, ringDesc ring.Desc, instanceExists bool, instanceID string, instanceDesc ring.InstanceDesc) (ring.InstanceState, ring.Tokens) {
	// When we initialize the scheduler instance in the ring we want to start from
	// a clean situation, so whatever the previous state was we set it to JOINING, while we keep existing
	// tokens (if any) or the ones loaded from file.
	var tokens []uint32
	if instanceExists {
		tokens = instanceDesc.GetTokens()
	}

	takenTokens := ringDesc.GetTokens()
	newTokens := ring.GenerateTokens(ringNumTokens-len(tokens), takenTokens)

	// Tokens sorting will be enforced by the parent caller.
	tokens = append(tokens, newTokens...)

	return ring.JOINING, tokens
}

func (s *Scheduler) OnRingInstanceTokens(_ *ring.BasicLifecycler, _ ring.Tokens) {}
func (s *Scheduler) OnRingInstanceStopping(_ *ring.BasicLifecycler)              {}
func (s *Scheduler) OnRingInstanceHeartbeat(_ *ring.BasicLifecycler, _ *ring.Desc, _ *ring.InstanceDesc) {
}

func (s *Scheduler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	s.ring.ServeHTTP(w, req)
}
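// The Limits dependency above is intentionally small. A minimal sketch of an implementation
// (hypothetical, for illustration only; Loki's real per-tenant limits live in the validation
// package) could look like:
//
//	// staticLimits applies the same querier cap to every tenant.
//	type staticLimits struct{ maxQueriers int }
//
//	// MaxQueriersPerUser returns the fixed cap; 0 disables shuffle sharding.
//	func (l staticLimits) MaxQueriersPerUser(_ string) int { return l.maxQueriers }
//
// A value of this type can then be passed as the limits argument to NewScheduler.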